summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/xattr.c4
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/afs/file.c10
-rw-r--r--fs/aio.c2
-rw-r--r--fs/autofs4/expire.c8
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c10
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/ctree.c120
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c486
-rw-r--r--fs/btrfs/disk-io.h32
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c340
-rw-r--r--fs/btrfs/extent_io.c43
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file-item.c144
-rw-r--r--fs/btrfs/file.c150
-rw-r--r--fs/btrfs/free-space-cache.c103
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode.c504
-rw-r--r--fs/btrfs/ioctl.c74
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c128
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/qgroup.c283
-rw-r--r--fs/btrfs/relocation.c102
-rw-r--r--fs/btrfs/root-tree.c201
-rw-r--r--fs/btrfs/scrub.c92
-rw-r--r--fs/btrfs/send.c235
-rw-r--r--fs/btrfs/super.c25
-rw-r--r--fs/btrfs/transaction.c322
-rw-r--r--fs/btrfs/transaction.h50
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c15
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c351
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/buffer.c55
-rw-r--r--fs/cachefiles/interface.c13
-rw-r--r--fs/cachefiles/namei.c10
-rw-r--r--fs/cachefiles/rdwr.c30
-rw-r--r--fs/cachefiles/xattr.c6
-rw-r--r--fs/ceph/addr.c103
-rw-r--r--fs/ceph/caps.c102
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c25
-rw-r--r--fs/ceph/ioctl.c4
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/ceph/mdsmap.c42
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_debug.c52
-rw-r--r--fs/cifs/cifs_unicode.h8
-rw-r--r--fs/cifs/cifsencrypt.c189
-rw-r--r--fs/cifs/cifsfs.c22
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h69
-rw-r--r--fs/cifs/cifspdu.h17
-rw-r--r--fs/cifs/cifsproto.h9
-rw-r--r--fs/cifs/cifssmb.c425
-rw-r--r--fs/cifs/connect.c172
-rw-r--r--fs/cifs/dir.c14
-rw-r--r--fs/cifs/file.c60
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/link.c84
-rw-r--r--fs/cifs/misc.c3
-rw-r--r--fs/cifs/readdir.c37
-rw-r--r--fs/cifs/sess.c101
-rw-r--r--fs/cifs/smb1ops.c53
-rw-r--r--fs/cifs/smb2file.c24
-rw-r--r--fs/cifs/smb2glob.h2
-rw-r--r--fs/cifs/smb2inode.c57
-rw-r--r--fs/cifs/smb2misc.c4
-rw-r--r--fs/cifs/smb2ops.c102
-rw-r--r--fs/cifs/smb2pdu.c502
-rw-r--r--fs/cifs/smb2pdu.h114
-rw-r--r--fs/cifs/smb2proto.h20
-rw-r--r--fs/cifs/smb2transport.c246
-rw-r--r--fs/cifs/smbfsctl.h27
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/configfs/dir.c15
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/coredump.c121
-rw-r--r--fs/dcache.c11
-rw-r--r--fs/debugfs/file.c43
-rw-r--r--fs/debugfs/inode.c69
-rw-r--r--fs/dlm/config.c5
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/lowcomms.c177
-rw-r--r--fs/ecryptfs/crypto.c337
-rw-r--r--fs/ecryptfs/file.c7
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efivarfs/inode.c14
-rw-r--r--fs/eventpoll.c16
-rw-r--r--fs/exec.c23
-rw-r--r--fs/exofs/inode.c6
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/inode.c10
-rw-r--r--fs/ext3/namei.c9
-rw-r--r--fs/ext3/super.c56
-rw-r--r--fs/ext4/balloc.c18
-rw-r--r--fs/ext4/ext4.h206
-rw-r--r--fs/ext4/ext4_jbd2.c58
-rw-r--r--fs/ext4/ext4_jbd2.h29
-rw-r--r--fs/ext4/extents.c473
-rw-r--r--fs/ext4/extents_status.c218
-rw-r--r--fs/ext4/extents_status.h55
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/fsync.c52
-rw-r--r--fs/ext4/ialloc.c64
-rw-r--r--fs/ext4/indirect.c40
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c1797
-rw-r--r--fs/ext4/ioctl.c3
-rw-r--r--fs/ext4/mballoc.c32
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c5
-rw-r--r--fs/ext4/namei.c44
-rw-r--r--fs/ext4/page-io.c336
-rw-r--r--fs/ext4/resize.c24
-rw-r--r--fs/ext4/super.c170
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c117
-rw-r--r--fs/f2fs/data.c80
-rw-r--r--fs/f2fs/debug.c38
-rw-r--r--fs/f2fs/dir.c148
-rw-r--r--fs/f2fs/f2fs.h99
-rw-r--r--fs/f2fs/file.c69
-rw-r--r--fs/f2fs/gc.c83
-rw-r--r--fs/f2fs/gc.h36
-rw-r--r--fs/f2fs/inode.c19
-rw-r--r--fs/f2fs/namei.c37
-rw-r--r--fs/f2fs/node.c47
-rw-r--r--fs/f2fs/node.h106
-rw-r--r--fs/f2fs/recovery.c152
-rw-r--r--fs/f2fs/segment.c130
-rw-r--r--fs/f2fs/super.c423
-rw-r--r--fs/f2fs/xattr.c68
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/fat/misc.c5
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file_table.c31
-rw-r--r--fs/fs-writeback.c19
-rw-r--r--fs/fscache/cache.c34
-rw-r--r--fs/fscache/cookie.c93
-rw-r--r--fs/fscache/fsdef.c1
-rw-r--r--fs/fscache/internal.h11
-rw-r--r--fs/fscache/main.c11
-rw-r--r--fs/fscache/netfs.c1
-rw-r--r--fs/fscache/object-list.c103
-rw-r--r--fs/fscache/object.c1106
-rw-r--r--fs/fscache/operation.c37
-rw-r--r--fs/fscache/page.c65
-rw-r--r--fs/fuse/dir.c51
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/Kconfig5
-rw-r--r--fs/gfs2/aops.c17
-rw-r--r--fs/gfs2/bmap.c4
-rw-r--r--fs/gfs2/dir.c26
-rw-r--r--fs/gfs2/dir.h3
-rw-r--r--fs/gfs2/file.c69
-rw-r--r--fs/gfs2/glops.c26
-rw-r--r--fs/gfs2/inode.c150
-rw-r--r--fs/gfs2/inode.h1
-rw-r--r--fs/gfs2/log.c78
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c40
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c22
-rw-r--r--fs/gfs2/meta_io.h26
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c7
-rw-r--r--fs/gfs2/rgrp.c14
-rw-r--r--fs/gfs2/trans.c9
-rw-r--r--fs/hpfs/buffer.c33
-rw-r--r--fs/hpfs/file.c40
-rw-r--r--fs/hpfs/hpfs_fn.h7
-rw-r--r--fs/hpfs/map.c22
-rw-r--r--fs/hpfs/super.c17
-rw-r--r--fs/hppfs/hppfs.c11
-rw-r--r--fs/isofs/inode.c16
-rw-r--r--fs/jbd/transaction.c19
-rw-r--r--fs/jbd2/Kconfig6
-rw-r--r--fs/jbd2/checkpoint.c22
-rw-r--r--fs/jbd2/commit.c184
-rw-r--r--fs/jbd2/journal.c166
-rw-r--r--fs/jbd2/recovery.c11
-rw-r--r--fs/jbd2/revoke.c49
-rw-r--r--fs/jbd2/transaction.c526
-rw-r--r--fs/jfs/jfs_dmap.c70
-rw-r--r--fs/jfs/jfs_dtree.c37
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c69
-rw-r--r--fs/jfs/jfs_metapage.c10
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c22
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/libfs.c3
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/clntproc.c5
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/locks.c71
-rw-r--r--fs/logfs/dev_mtd.c2
-rw-r--r--fs/logfs/file.c3
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/logfs/super.c6
-rw-r--r--fs/namei.c12
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Kconfig14
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/callback.c6
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c3
-rw-r--r--fs/nfs/callback_xdr.c52
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/dir.c98
-rw-r--r--fs/nfs/dns_resolve.c32
-rw-r--r--fs/nfs/file.c38
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c56
-rw-r--r--fs/nfs/inode.c149
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs4_fs.h12
-rw-r--r--fs/nfs/nfs4client.c18
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c3
-rw-r--r--fs/nfs/nfs4filelayout.h3
-rw-r--r--fs/nfs/nfs4filelayoutdev.c8
-rw-r--r--fs/nfs/nfs4proc.c733
-rw-r--r--fs/nfs/nfs4session.c42
-rw-r--r--fs/nfs/nfs4session.h9
-rw-r--r--fs/nfs/nfs4state.c114
-rw-r--r--fs/nfs/nfs4super.c14
-rw-r--r--fs/nfs/nfs4xdr.c288
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/proc.c13
-rw-r--r--fs/nfs/super.c215
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/nfs4proc.c46
-rw-r--r--fs/nfsd/nfs4state.c238
-rw-r--r--fs/nfsd/nfs4xdr.c169
-rw-r--r--fs/nfsd/nfsd.h27
-rw-r--r--fs/nfsd/nfssvc.c13
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h7
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nilfs2/alloc.c63
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/ifile.c22
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/segment.c4
-rw-r--r--fs/nilfs2/super.c33
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/nilfs2/the_nilfs.h4
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify_user.c89
-rw-r--r--fs/notify/inotify/inotify_user.c13
-rw-r--r--fs/notify/mark.c50
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ocfs2/alloc.c8
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/cluster/heartbeat.c59
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c29
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c8
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c14
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/namei.c70
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/refcounttree.c5
-rw-r--r--fs/ocfs2/suballoc.c37
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/xattr.c18
-rw-r--r--fs/open.c8
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/task_mmu.c145
-rw-r--r--fs/proc/uptime.c3
-rw-r--r--fs/proc/vmcore.c694
-rw-r--r--fs/pstore/ftrace.c2
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/platform.c21
-rw-r--r--fs/pstore/ram.c5
-rw-r--r--fs/pstore/ram_core.c54
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/reiserfs/inode.c12
-rw-r--r--fs/reiserfs/procfs.c99
-rw-r--r--fs/reiserfs/super.c3
-rw-r--r--fs/select.c66
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/splice.c1
-rw-r--r--fs/super.c25
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysfs/file.c10
-rw-r--r--fs/sysfs/group.c70
-rw-r--r--fs/sysfs/inode.c2
-rw-r--r--fs/timerfd.c131
-rw-r--r--fs/ubifs/file.c5
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/super.c342
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_alloc.c24
-rw-r--r--fs/xfs/xfs_aops.c29
-rw-r--r--fs/xfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/xfs_bmap.c199
-rw-r--r--fs/xfs/xfs_bmap.h1
-rw-r--r--fs/xfs/xfs_bmap_btree.h2
-rw-r--r--fs/xfs/xfs_buf_item.c87
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dinode.h6
-rw-r--r--fs/xfs/xfs_dir2_block.c20
-rw-r--r--fs/xfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/xfs_dquot.c31
-rw-r--r--fs/xfs/xfs_dquot.h11
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c74
-rw-r--r--fs/xfs/xfs_ialloc.h8
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_icreate_item.c195
-rw-r--r--fs/xfs/xfs_icreate_item.h52
-rw-r--r--fs/xfs/xfs_inode.c105
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_ioctl.c16
-rw-r--r--fs/xfs/xfs_iomap.c13
-rw-r--r--fs/xfs/xfs_iops.c27
-rw-r--r--fs/xfs/xfs_itable.c33
-rw-r--r--fs/xfs/xfs_log.c24
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c75
-rw-r--r--fs/xfs/xfs_log_recover.c127
-rw-r--r--fs/xfs/xfs_mount.c156
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_qm.c454
-rw-r--r--fs/xfs/xfs_qm.h97
-rw-r--r--fs/xfs/xfs_qm_bhv.c10
-rw-r--r--fs/xfs/xfs_qm_syscalls.c110
-rw-r--r--fs/xfs/xfs_quota.h104
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_sb.h13
-rw-r--r--fs/xfs/xfs_super.c64
-rw-r--r--fs/xfs/xfs_symlink.c61
-rw-r--r--fs/xfs/xfs_symlink.h2
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.h20
-rw-r--r--fs/xfs/xfs_trans.c118
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_buf.c34
-rw-r--r--fs/xfs/xfs_trans_dquot.c122
-rw-r--r--fs/xfs/xfs_trans_inode.c11
-rw-r--r--fs/xfs/xfs_vnodeops.c28
395 files changed, 15351 insertions, 9410 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
endif
+
+
+config 9P_FS_SECURITY
+ bool "9P Security Labels"
+ depends on 9P_FS
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the 9P filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
v9fs.o \
fid.o \
xattr.o \
- xattr_user.o
+ xattr_user.o \
+ xattr_trusted.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
* @offset: offset in the page
*/
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
v9fs_fscache_invalidate_page(page);
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..94de6d1482e2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -146,7 +146,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
char type = 0, ext[32];
int major = -1, minor = -1;
- strncpy(ext, stat->extension, sizeof(ext));
+ strlcpy(ext, stat->extension, sizeof(ext));
sscanf(ext, "%c %u %u", &type, &major, &minor);
switch (type) {
case 'c':
@@ -1054,13 +1054,11 @@ static int
v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_wstat *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat);
@@ -1188,7 +1186,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
* this even with .u extension. So check
* for non NULL stat->extension
*/
- strncpy(ext, stat->extension, sizeof(ext));
+ strlcpy(ext, stat->extension, sizeof(ext));
/* HARDLINKCOUNT %u */
sscanf(ext, "%13s %u", tag_name, &i_nlink);
if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
+ &v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_POSIX_ACL
&v9fs_xattr_acl_access_handler,
&v9fs_xattr_acl_default_handler,
#endif
+#ifdef CONFIG_9P_FS_SECURITY
+ &v9fs_xattr_security_handler,
+#endif
NULL
};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
extern const struct xattr_handler *v9fs_xattr_handlers[];
extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_security_get,
+ .set = v9fs_xattr_security_set,
+};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_trusted_get,
+ .set = v9fs_xattr_trusted_set,
+};
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
- _enter("{%lu},%lu", page->index, offset);
+ _enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0) {
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/aio.c b/fs/aio.c
index a8ecc8313fb0..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -625,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
/*
* Add a completion event to the ring buffer. Must be done holding
- * ctx->ctx_lock to prevent other code from messing with the tail
+ * ctx->completion_lock to prevent other code from messing with the tail
* pointer since we might be called from irq context.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
/* Already gone or negative dentry (under construction) - try next */
- if (q->d_count == 0 || !simple_positive(q)) {
+ if (!d_count(q) || !simple_positive(q)) {
spin_unlock(&q->d_lock);
next = q->d_u.d_child.next;
goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
else
ino_count++;
- if (p->d_count > ino_count) {
+ if (d_count(p) > ino_count) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
if (!exp_leaves) {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
} else {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ca8e55548d98..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (active->d_count == 0)
+ if (!d_count(active))
goto next;
qstr = &active->d_name;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 41aea03698b6..b3192da97cab 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -254,8 +254,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02fb679a5823..7d863a4de5a5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 431b6a04ebfd..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
struct backing_dev_info *old = inode->i_data.backing_dev_info;
+ bool wakeup_bdi = false;
if (unlikely(dst == old)) /* deadlock avoidance */
return;
bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY)
+ if (inode->i_state & I_DIRTY) {
+ if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+ wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ }
spin_unlock(&inode->i_lock);
spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock);
+
+ if (wakeup_bdi)
+ bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -1562,6 +1569,7 @@ static const struct address_space_operations def_blk_aops = {
.writepages = generic_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
};
const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* to a logical address
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- u64 time_seq,
- struct __prelim_ref *ref,
- struct ulist *parents,
- const u64 *extent_item_pos)
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
- struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int root_level;
int level = ref->level;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
-
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
time_seq, ref->wanted_disk_byte,
extent_item_pos);
out:
- btrfs_free_path(path);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
return ret;
}
@@ -322,7 +316,7 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root, u64 time_seq,
+ struct btrfs_path *path, u64 time_seq,
struct list_head *head,
const u64 *extent_item_pos)
{
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
- err = __resolve_indirect_ref(fs_info, search_commit_root,
- time_seq, ref, parents,
- extent_item_pos);
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos);
if (err == -ENOMEM)
goto out;
if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
int slot;
struct extent_buffer *leaf;
struct btrfs_key key;
+ struct btrfs_key found_key;
unsigned long ptr;
unsigned long end;
struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
*info_level = btrfs_tree_block_level(leaf, info);
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
} else {
BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
}
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int info_level = 0;
int ret;
- int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
struct list_head prefs_delayed;
struct list_head prefs;
struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&prefs_delayed);
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
+ if (!trans)
+ path->search_commit_root = 1;
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
goto out;
BUG_ON(ret == 0);
- if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ if (trans) {
/*
* look if there are updates for this ref queued and lock the
* head
@@ -869,7 +870,8 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == bytenr &&
- key.type == BTRFS_EXTENT_ITEM_KEY) {
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs);
if (ret)
@@ -890,8 +892,8 @@ again:
__merge_refs(&prefs, 1);
- ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
- &prefs, extent_item_pos);
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos);
if (ret)
goto out;
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
{
int ret;
u64 flags;
+ u64 size = 0;
u32 item_size;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
- key.type = BTRFS_EXTENT_ITEM_KEY;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
key.offset = (u64)-1;
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
return ret;
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
- if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->extent_root->leafsize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+ found_key->type != BTRFS_METADATA_ITEM_KEY) ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical) {
+ found_key->objectid + size <= logical) {
pr_debug("logical %llu is not within any extent\n",
(unsigned long long)logical);
return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct ulist *refs = NULL;
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
- if (search_commit_root) {
- trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
- } else {
+ if (!search_commit_root) {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
#include "ulist.h"
#include "extent_io.h"
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02fae7f7e42c..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- tree_mod_log_free_eb(root->fs_info, buf);
+ if (last_ref)
+ tree_mod_log_free_eb(root->fs_info, buf);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
* time_seq).
*/
static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
- struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq, struct tree_mod_elem *first_tm)
{
u32 n;
struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
+ tree_mod_log_read_lock(fs_info);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
if (tm->index != first_tm->index)
break;
}
+ tree_mod_log_read_unlock(fs_info);
btrfs_set_header_nritems(eb, n);
}
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
extent_buffer_get(eb_rewin);
btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+ __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_generation(eb, old_generation);
}
if (tm)
- __tree_mod_log_rewind(eb, time_seq, tm);
+ __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
}
}
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
{
int slot;
int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int ret = 0;
int blocksize;
parent = path->nodes[level + 1];
if (!parent)
- return 0;
+ return;
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block2 = 0;
free_extent_buffer(eb);
}
- if (block1 || block2) {
- ret = -EAGAIN;
-
- /* release the whole path */
- btrfs_release_path(path);
-
- /* read the blocks */
- if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
- if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
- if (block1) {
- eb = read_tree_block(root, block1, blocksize, 0);
- free_extent_buffer(eb);
- }
- if (block2) {
- eb = read_tree_block(root, block2, blocksize, 0);
- free_extent_buffer(eb);
- }
- }
- return ret;
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
}
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
- /*
- * we found an up to date block without
- * sleeping, return
- * right away
- */
- *eb_ret = tmp;
- return 0;
- }
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- free_extent_buffer(tmp);
- btrfs_set_path_blocking(p);
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ *eb_ret = tmp;
+ return 0;
+ }
- /* now we're allowed to do a blocking uptodate check */
- tmp = read_tree_block(root, blocknr, blocksize, gen);
- if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
- *eb_ret = tmp;
- return 0;
- }
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ btrfs_set_path_blocking(p);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_buffer(tmp, gen);
+ if (!ret) {
+ *eb_ret = tmp;
+ return 0;
}
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
/*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = split_node(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
goto again;
}
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = balance_level(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
*/
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path, int level, int log_removal)
+ struct btrfs_path *path, int level)
{
u64 lower_gen;
struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- tree_mod_log_set_root_pointer(root, c, log_removal);
+ tree_mod_log_set_root_pointer(root, c, 0);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
/*
* trying to split the root, lets make a new one
*
- * tree mod log: We pass 0 as log_removal parameter to
+ * tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
* elements below with tree_mod_log_eb_copy. We're holding a
* tree lock on the buffer, which is why we cannot race with
* other tree_mod_log users.
*/
- ret = insert_new_root(trans, root, path, level + 1, 0);
+ ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
} else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size) {
+ if (data_size && path->nodes[1]) {
wret = push_leaf_right(trans, root, path, data_size,
data_size, 0, 0);
if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
}
if (!path->nodes[1]) {
- ret = insert_new_root(trans, root, path, 1, 1);
+ ret = insert_new_root(trans, root, path, 1);
if (ret)
return ret;
}
@@ -4430,7 +4398,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
}
/*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
*/
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
u32 data_size)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
account */
/*
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed, so this counter is inc'ed everytime we call
+ * btrfs_free_extent so it is a realtime count of what will be freed
+ * once the transaction is committed. It will be zero'ed everytime the
+ * transaction commits.
+ */
+ struct percpu_counter total_bytes_pinned;
+
+ /*
* we bump reservation progress every time we decrement
* bytes_reserved. This way people waiting for reservations
* know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
atomic_t open_ioctl_trans;
/*
- * this is used by the balancing code to wait for all the pending
- * ordered extents
+ * this is used to protect the following list -- ordered_roots.
*/
- spinlock_t ordered_extent_lock;
+ spinlock_t ordered_root_lock;
/*
- * all of the data=ordered extents pending writeback
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
* these can span multiple transactions and basically include
* every dirty data page that isn't from nodatacow
*/
- struct list_head ordered_extents;
+ struct list_head ordered_roots;
- spinlock_t delalloc_lock;
- /*
- * all of the inodes that have delalloc bytes. It is possible for
- * this list to be empty even when there is still dirty data=ordered
- * extents waiting to finish IO.
- */
- struct list_head delalloc_inodes;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
/*
* there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
- int enospc_unlink;
- int trans_no_join;
u64 total_pinned;
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
struct rb_root qgroup_tree;
spinlock_t qgroup_lock;
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
/* protect user change for quota operations */
struct mutex qgroup_ioctl_lock;
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
struct btrfs_workers qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
/* filesystem state */
unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
int force_cow;
spinlock_t root_item_lock;
+ atomic_t refs;
+
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
};
struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
num_items;
}
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+ struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+ return (root->fs_info->sb->s_flags & MS_RDONLY ||
+ btrfs_fs_closing(root->fs_info));
+}
+
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *item);
void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
- btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
return next;
}
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
- u64 root_id)
-{
- struct btrfs_key root_key;
-
- if (root->objectid == root_id)
- return root;
-
- root_key.objectid = root_id;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1191,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
root->name = NULL;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1199,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
@@ -1216,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
+ atomic_set(&root->refs, 1);
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1234,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->root_item_lock);
}
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
-{
- int ret;
- u32 blocksize;
- u64 generation;
-
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
- ret = btrfs_find_last_root(tree_root, objectid,
- &root->root_item, &root->root_key);
- if (ret > 0)
- return -ENOENT;
- else if (ret < 0)
- return ret;
-
- generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
- root->commit_root = NULL;
- root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
- free_extent_buffer(root->node);
- root->node = NULL;
- return -EIO;
- }
- root->commit_root = btrfs_root_node(root);
- return 0;
-}
-
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1451,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *l;
u64 generation;
u32 blocksize;
- int ret = 0;
- int slot;
+ int ret;
- root = btrfs_alloc_root(fs_info);
- if (!root)
+ path = btrfs_alloc_path();
+ if (!path)
return ERR_PTR(-ENOMEM);
- if (location->offset == (u64)-1) {
- ret = find_and_setup_root(tree_root, fs_info,
- location->objectid, root);
- if (ret) {
- kfree(root);
- return ERR_PTR(ret);
- }
- goto out;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root) {
+ ret = -ENOMEM;
+ goto alloc_fail;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, location->objectid);
+ root, fs_info, key->objectid);
- path = btrfs_alloc_path();
- if (!path) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret == 0) {
- l = path->nodes[0];
- slot = path->slots[0];
- btrfs_read_root_item(l, slot, &root->root_item);
- memcpy(&root->root_key, location, sizeof(*location));
- }
- btrfs_free_path(path);
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
if (ret) {
- kfree(root);
if (ret > 0)
ret = -ENOENT;
- return ERR_PTR(ret);
+ goto find_fail;
}
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- if (!root->node || !extent_buffer_uptodate(root->node)) {
- ret = (!root->node) ? -ENOMEM : -EIO;
-
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
+ if (!root->node) {
+ ret = -ENOMEM;
+ goto find_fail;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto read_fail;
}
-
root->commit_root = btrfs_root_node(root);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_free_path(path);
+ return root;
+
+read_fail:
+ free_extent_buffer(root->node);
+find_fail:
+ kfree(root);
+alloc_fail:
+ root = ERR_PTR(ret);
+ goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_read_tree_root(tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1522,6 +1502,66 @@ out:
return root;
}
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+ int ret;
+
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned || !root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ btrfs_init_free_ino_ctl(root);
+ mutex_init(&root->fs_commit_mutex);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ return 0;
+fail:
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ return ret;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
@@ -1542,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->quota_root ? fs_info->quota_root :
ERR_PTR(-ENOENT);
again:
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)location->objectid);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root)
return root;
- root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ root = btrfs_read_fs_root(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
goto fail;
}
- btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
-
- ret = get_anon_bdev(&root->anon_dev);
+ ret = btrfs_init_fs_root(root);
if (ret)
goto fail;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = -ENOENT;
- goto fail;
- }
-
ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
if (ret < 0)
goto fail;
if (ret == 0)
root->orphan_item_inserted = 1;
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (ret)
- goto fail;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
- if (ret == 0)
- root->in_radix = 1;
-
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) {
free_fs_root(root);
@@ -1601,10 +1613,6 @@ again:
}
goto fail;
}
-
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- WARN_ON(ret);
return root;
fail:
free_fs_root(root);
@@ -1676,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ int again;
do {
- int again = 0;
-
- if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
- down_read_trylock(&root->fs_info->sb->s_umount)) {
- if (mutex_trylock(&root->fs_info->cleaner_mutex)) {
- btrfs_run_delayed_iputs(root);
- again = btrfs_clean_one_deleted_snapshot(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- }
- btrfs_run_defrag_inodes(root->fs_info);
- up_read(&root->fs_info->sb->s_umount);
+ again = 0;
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(root))
+ goto sleep;
+
+ if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ goto sleep;
+
+ /*
+ * Avoid the problem that we change the status of the fs
+ * during the above check and trylock.
+ */
+ if (btrfs_need_cleaner_sleep(root)) {
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ goto sleep;
}
+ btrfs_run_delayed_iputs(root);
+ again = btrfs_clean_one_deleted_snapshot(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+ * needn't do anything special here.
+ */
+ btrfs_run_defrag_inodes(root->fs_info);
+sleep:
if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
@@ -1724,7 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = get_seconds();
- if (!cur->blocked &&
+ if (cur->state < TRANS_STATE_BLOCKED &&
(now < cur->start_time || now - cur->start_time < 30)) {
spin_unlock(&root->fs_info->trans_lock);
delay = HZ * 5;
@@ -2034,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
list_del(&gang[0]->root_list);
if (gang[0]->in_radix) {
- btrfs_free_fs_root(fs_info, gang[0]);
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
free_extent_buffer(gang[0]->commit_root);
- kfree(gang[0]);
+ btrfs_put_fs_root(gang[0]);
}
}
@@ -2049,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
if (!ret)
break;
for (i = 0; i < ret; i++)
- btrfs_free_fs_root(fs_info, gang[i]);
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
@@ -2081,14 +2105,8 @@ int open_ctree(struct super_block *sb,
int backup_index = 0;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
- extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
- csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
- dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-
- if (!tree_root || !extent_root || !csum_root ||
- !chunk_root || !dev_root || !quota_root) {
+ if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
}
@@ -2131,9 +2149,9 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
- spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2169,7 +2187,6 @@ int open_ctree(struct super_block *sb,
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
- fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
@@ -2180,8 +2197,8 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
- INIT_LIST_HEAD(&fs_info->ordered_extents);
- spin_lock_init(&fs_info->ordered_extent_lock);
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
GFP_NOFS);
if (!fs_info->delayed_root) {
@@ -2274,6 +2291,7 @@ int open_ctree(struct super_block *sb,
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
mutex_init(&fs_info->qgroup_rescan_lock);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2638,33 +2656,44 @@ retry_root_backup:
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_EXTENT_TREE_OBJECTID, extent_root);
- if (ret)
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ extent_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(extent_root)) {
+ ret = PTR_ERR(extent_root);
goto recovery_tree_root;
+ }
extent_root->track_dirty = 1;
+ fs_info->extent_root = extent_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_DEV_TREE_OBJECTID, dev_root);
- if (ret)
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ dev_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(dev_root)) {
+ ret = PTR_ERR(dev_root);
goto recovery_tree_root;
+ }
dev_root->track_dirty = 1;
+ fs_info->dev_root = dev_root;
+ btrfs_init_devices_late(fs_info);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_CSUM_TREE_OBJECTID, csum_root);
- if (ret)
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ csum_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(csum_root)) {
+ ret = PTR_ERR(csum_root);
goto recovery_tree_root;
+ }
csum_root->track_dirty = 1;
+ fs_info->csum_root = csum_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_QUOTA_TREE_OBJECTID, quota_root);
- if (ret) {
- kfree(quota_root);
- quota_root = fs_info->quota_root = NULL;
- } else {
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ quota_root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(quota_root)) {
quota_root->track_dirty = 1;
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
+ fs_info->quota_root = quota_root;
}
fs_info->generation = generation;
@@ -2817,11 +2846,9 @@ retry_root_backup:
location.objectid = BTRFS_FS_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
+ location.offset = 0;
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (!fs_info->fs_root)
- goto fail_qgroup;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
goto fail_qgroup;
@@ -2853,6 +2880,8 @@ retry_root_backup:
return ret;
}
+ btrfs_qgroup_rescan_resume(fs_info);
+
return 0;
fail_qgroup:
@@ -3258,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_RAID10)) {
num_tolerated_disk_barrier_failures = 1;
} else if (flags &
- BTRFS_BLOCK_GROUP_RAID5) {
+ BTRFS_BLOCK_GROUP_RAID6) {
num_tolerated_disk_barrier_failures = 2;
}
}
@@ -3366,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3397,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
kfree(root->name);
- kfree(root);
+ btrfs_put_fs_root(root);
+}
+
+void btrfs_free_fs_root(struct btrfs_root *root)
+{
+ free_fs_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3653,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
@@ -3661,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
@@ -3676,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
/*
* This will just short circuit the ordered completion stuff which will
* make sure the ordered extent gets properly cleaned up.
*/
- list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+ list_for_each_entry(ordered, &root->ordered_extents,
root_extent_list)
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_del_init(&root->ordered_root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ cond_resched_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
}
int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3706,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->root)) != NULL) {
struct btrfs_delayed_ref_head *head = NULL;
+ bool pin_bytes = false;
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
atomic_set(&ref->refs, 1);
@@ -3726,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
}
if (head->must_insert_reserved)
- btrfs_pin_extent(root, ref->bytenr,
- ref->num_bytes, 1);
+ pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
@@ -3738,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (head)
- mutex_unlock(&head->mutex);
spin_unlock(&delayed_refs->lock);
+ if (head) {
+ if (pin_bytes)
+ btrfs_pin_extent(root, ref->bytenr,
+ ref->num_bytes, 1);
+ mutex_unlock(&head->mutex);
+ }
btrfs_put_delayed_ref(ref);
cond_resched();
@@ -3777,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
list_del_init(&btrfs_inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&btrfs_inode->runtime_flags);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ list_del_init(&root->delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3878,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
cur_trans->dirty_pages.dirty_bytes);
- /* FIXME: cleanup wait for commit */
- cur_trans->in_commit = 1;
- cur_trans->blocked = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(cur_trans);
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- cur_trans->commit_done = 1;
- wake_up(&cur_trans->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
@@ -3899,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ cur_trans->state =TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+
/*
memset(cur_trans, 0, sizeof(*cur_trans));
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3914,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
list_splice_init(&root->fs_info->trans_list, &list);
- root->fs_info->trans_no_join = 1;
+ root->fs_info->running_transaction = NULL;
spin_unlock(&root->fs_info->trans_lock);
while (!list_empty(&list)) {
@@ -3922,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_ordered_operations(t, root);
- btrfs_destroy_ordered_extents(root);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
btrfs_destroy_delayed_refs(t, root);
- /* FIXME: cleanup wait for commit */
- t->in_commit = 1;
- t->blocked = 1;
+ /*
+ * FIXME: cleanup wait for commit
+ * We needn't acquire the lock here, because we are during
+ * the umount, there is no other task which will change it.
+ */
+ t->state = TRANS_STATE_COMMIT_START;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
btrfs_evict_pending_snapshots(t);
- t->blocked = 0;
+ t->state = TRANS_STATE_UNBLOCKED;
smp_mb();
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
- t->commit_done = 1;
- smp_mb();
- if (waitqueue_active(&t->commit_wait))
- wake_up(&t->commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
- btrfs_destroy_delalloc_inodes(root);
-
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
btrfs_destroy_marked_extents(root, &t->dirty_pages,
EXTENT_DIRTY);
@@ -3960,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
+ t->state = TRANS_STATE_COMPLETED;
+ smp_mb();
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+
atomic_set(&t->use_count, 0);
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);
}
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 0;
- spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+ if (atomic_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->refs))
+ kfree(root);
+}
+
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
goto fail;
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- err = -ENOENT;
- goto fail;
- }
-
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
return 0;
}
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+ u64 num_bytes;
+
+ num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+ sizeof(struct btrfs_extent_inline_ref));
+ if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+ /*
+ * We don't ever fill up leaves all the way so multiply by 2 just to be
+ * closer to what we're really going to want to ouse.
+ */
+ return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *global_rsv;
+ u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 num_bytes;
+ int ret = 0;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes <<= 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ /*
+ * If we can't allocate any more chunks lets make sure we have _lots_ of
+ * wiggle room since running delayed refs can create more delayed refs.
+ */
+ if (global_rsv->space_info->full)
+ num_bytes <<= 1;
+
+ spin_lock(&global_rsv->lock);
+ if (global_rsv->reserved <= num_bytes)
+ ret = 1;
+ spin_unlock(&global_rsv->lock);
+ return ret;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
if (old) {
DEFINE_WAIT(__wait);
- if (delayed_refs->num_entries < 16348)
+ if (delayed_refs->flushing ||
+ !btrfs_should_throttle_delayed_refs(trans, root))
return 0;
prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
while (1) {
if (!(run_all || run_most) &&
- delayed_refs->num_heads_ready < 64)
+ !btrfs_should_throttle_delayed_refs(trans, root))
break;
/*
@@ -2629,6 +2676,7 @@ again:
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
atomic_dec(&delayed_refs->procs_running_refs);
+ wake_up(&delayed_refs->wait);
return ret;
}
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info *found;
int i;
int factor;
+ int ret;
if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
}
/*
- * If we have less pinned bytes than we want to allocate then
- * don't bother committing the transaction, it won't help us.
+ * If we don't have enough pinned space to deal with this
+ * allocation don't bother committing the transaction.
*/
- if (data_sinfo->bytes_pinned < bytes)
+ if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+ bytes) < 0)
committed = 1;
spin_unlock(&data_sinfo->lock);
@@ -3577,6 +3633,7 @@ commit_trans:
if (!committed &&
!atomic_read(&root->fs_info->open_ioctl_trans)) {
committed = 1;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
+ WARN_ON(data_sinfo->bytes_may_use < bytes);
data_sinfo->bytes_may_use -= bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
unsigned long nr_pages)
{
struct super_block *sb = root->fs_info->sb;
- int started;
- /* If we can not start writeback, just sync all the delalloc file. */
- started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
- if (!started) {
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
/*
* We needn't worry the filesystem going from r/w to r/o though
* we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_inodes(root, 0);
+ btrfs_start_all_delalloc_inodes(root->fs_info, 0);
if (!current->journal_info)
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
}
}
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
if (delalloc_bytes == 0) {
if (trans)
return;
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
return;
}
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0);
+ btrfs_wait_all_ordered_extents(root->fs_info, 0);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
/* See if there is enough pinned space to make this reservation */
spin_lock(&space_info->lock);
- if (space_info->bytes_pinned >= bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes) >= 0) {
spin_unlock(&space_info->lock);
goto commit;
}
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
- if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
spin_unlock(&space_info->lock);
return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ block_rsv_add_bytes(dest, num_bytes, 1);
+ return 0;
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
int factor;
/* block accounting for super block */
- spin_lock(&info->delalloc_lock);
+ spin_lock(&info->delalloc_root_lock);
old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
btrfs_set_super_bytes_used(info->super_copy, old_val);
- spin_unlock(&info->delalloc_lock);
+ spin_unlock(&info->delalloc_root_lock);
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
return ret;
}
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ cache_block_group(block_group, 0);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ /* Logic error */
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ if (ret)
+ goto out_lock;
+
+ num_bytes = (start + num_bytes) -
+ caching_ctl->progress;
+ start = caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ }
+out_lock:
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+ struct extent_buffer *eb)
+{
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int i;
+
+ if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ __exclude_logged_extent(log, key.objectid, key.offset);
+ }
+
+ return 0;
+}
+
/**
* btrfs_update_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
+ struct btrfs_space_info *space_info;
down_write(&fs_info->extent_commit_sem);
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
up_write(&fs_info->extent_commit_sem);
+ list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+ percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
update_global_block_rsv(fs_info);
}
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
return 0;
}
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ BUG_ON(!space_info); /* Logic bug */
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_block_group_cache *cache = NULL;
+ int pin = 1;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ pin = 0;
}
out:
+ if (pin)
+ add_pinned_bytes(root->fs_info, buf->len,
+ btrfs_header_level(buf),
+ root->root_key.objectid);
+
/*
* Deleting the buffer, clear the corrupt flag since it doesn't matter
* anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+ add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
/*
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
- struct btrfs_caching_control *caching_ctl;
- u64 start = ins->objectid;
- u64 num_bytes = ins->offset;
-
- block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, 0);
- caching_ctl = get_caching_control(block_group);
-
- if (!caching_ctl) {
- BUG_ON(!block_group_cache_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- if (ret)
- goto out;
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(root, start, num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
- start = caching_ctl->progress;
- num_bytes = ins->objectid + ins->offset -
- caching_ctl->progress;
- ret = add_excluded_extent(root, start, num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- put_caching_control(caching_ctl);
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+ * need to do the exlude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
if (ret)
- goto out;
+ return ret;
}
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
-out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -7298,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int err = 0;
int ret;
int level;
+ bool root_dropped = false;
path = btrfs_alloc_path();
if (!path) {
@@ -7355,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
@@ -7370,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
break;
btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
@@ -7384,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
- if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
- pr_debug("btrfs: drop snapshot early exit\n");
- err = -EAGAIN;
- goto out_end_trans;
- }
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
@@ -7416,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->level == 0);
- if (btrfs_should_end_transaction(trans, tree_root)) {
+ if (btrfs_should_end_transaction(trans, tree_root) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(root))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
@@ -7427,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
btrfs_end_transaction_throttle(trans, tree_root);
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("btrfs: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -7447,8 +7620,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
- NULL, NULL);
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, tree_root, ret);
err = ret;
@@ -7465,18 +7638,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (root->in_radix) {
- btrfs_free_fs_root(tree_root->fs_info, root);
+ btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
- kfree(root);
+ btrfs_put_fs_root(root);
}
+ root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (root_dropped == false)
+ btrfs_add_dead_root(root);
if (err)
btrfs_std_error(root->fs_info, err);
return err;
@@ -7782,6 +7965,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
@@ -7868,6 +8052,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
do_div(min_free, dev_min);
}
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 dev_offset;
@@ -7878,7 +8069,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free &&
!device->is_tgtdev_for_dev_replace) {
- ret = find_free_dev_extent(device, min_free,
+ ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7890,6 +8081,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
}
}
mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
out:
btrfs_put_block_group(block_group);
return ret;
@@ -8032,6 +8224,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
dump_space_info(space_info, 0, 0);
}
}
+ percpu_counter_destroy(&space_info->total_bytes_pinned);
list_del(&space_info->list);
kfree(space_info);
}
@@ -8254,6 +8447,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
sizeof(item));
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
}
}
@@ -8591,8 +8788,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
- if (!ret)
- wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ ret = wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
}
ret = btrfs_trim_block_group(cache,
&group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
kmem_cache_free(extent_buffer_cache, eb);
}
}
+
+#define btrfs_debug_check_extent_io_range(inode, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct inode *inode, u64 start, u64 end)
+{
+ u64 isize = i_size_read(inode);
+
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ printk_ratelimited(KERN_DEBUG
+ "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ caller,
+ (unsigned long long)btrfs_ino(inode),
+ (unsigned long long)isize,
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
#define btrfs_leak_debug_del(entry) do {} while (0)
#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
#define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err;
int clear = 0;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
if (delete)
bits |= ~EXTENT_CTLBITS;
bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct rb_node *node;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
spin_lock(&tree->lock);
again:
while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
"mirror=%lu\n", (u64)bio->bi_sector, err,
io_bio->mirror_num);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
if (uptodate) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+
+ /* Zero out the end if this page straddles i_size */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2957,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
- sizeof(struct btrfs_sector_sum) * \
- (r)->sectorsize - (r)->sectorsize)
+ sizeof(u32) * (r)->sectorsize)
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(root));
+ MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
- GFP_NOFS);
+ GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
goto fail;
}
- sector_sum = sums->sums;
sums->bytenr = start;
- sums->len = size;
+ sums->len = (int)size;
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
offset *= csum_size;
+ size >>= root->fs_info->sb->s_blocksize_bits;
- while (size > 0) {
- read_extent_buffer(path->nodes[0],
- &sector_sum->sum,
- ((unsigned long)item) +
- offset, csum_size);
- sector_sum->bytenr = start;
-
- size -= root->sectorsize;
- start += root->sectorsize;
- offset += csum_size;
- sector_sum++;
- }
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += root->sectorsize * size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
char *data;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
+ int index;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
- u64 disk_bytenr;
WARN_ON(bio->bi_vcnt <= 0);
sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
if (!sums)
return -ENOMEM;
- sector_sum = sums->sums;
- disk_bytenr = (u64)bio->bi_sector << 9;
sums->len = bio->bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = (u64)bio->bi_sector << 9;
+ index = 0;
while (bio_index < bio->bi_vcnt) {
if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
BUG_ON(!sums); /* -ENOMEM */
- sector_sum = sums->sums;
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ordered->start;
+ sums->bytenr = ((u64)bio->bi_sector << 9) +
+ total_bytes;
+ index = 0;
}
data = kmap_atomic(bvec->bv_page);
- sector_sum->sum = ~(u32)0;
- sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset,
- sector_sum->sum,
- bvec->bv_len);
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+ sums->sums[index],
+ bvec->bv_len);
kunmap_atomic(data);
- btrfs_csum_final(sector_sum->sum,
- (char *)&sector_sum->sum);
- sector_sum->bytenr = disk_bytenr;
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
- sector_sum++;
bio_index++;
+ index++;
total_bytes += bvec->bv_len;
this_sum_bytes += bvec->bv_len;
- disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
bvec++;
}
@@ -672,62 +661,46 @@ out:
return ret;
}
-static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
- struct btrfs_sector_sum *sector_sum,
- u64 total_bytes, u64 sectorsize)
-{
- u64 tmp = sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
-
- while ((tmp + total_bytes) < sums->len) {
- if (next_sector + sectorsize != next->bytenr)
- break;
- tmp += sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- return tmp;
-}
-
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
- u64 bytenr;
- int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
- u64 next_offset;
- u64 total_bytes = 0;
- int found_next;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
u64 csum_offset;
- struct btrfs_sector_sum *sector_sum;
+ u64 bytenr;
u32 nritems;
u32 ins_size;
+ int index = 0;
+ int found_next;
+ int ret;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
- sector_sum = sums->sums;
again:
next_offset = (u64)-1;
found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = sector_sum->bytenr;
- bytenr = sector_sum->bytenr;
+ file_key.offset = bytenr;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
- item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
- leaf = path->nodes[0];
ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
free_space = btrfs_leaf_free_space(root, leaf) -
sizeof(struct btrfs_item) - csum_size;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
WARN_ON(tmp < 1);
@@ -822,6 +794,7 @@ again:
diff *= csum_size;
btrfs_extend_item(root, path, diff);
+ ret = 0;
goto csum;
}
@@ -831,8 +804,7 @@ insert:
if (found_next) {
u64 tmp;
- tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
- root->sectorsize);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
WARN_ON(1);
goto fail_unlock;
}
-csum:
leaf = path->nodes[0];
+csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- ret = 0;
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size_nr(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
- write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
- total_bytes += root->sectorsize;
- sector_sum++;
- if (total_bytes < sums->len) {
- item = (struct btrfs_csum_item *)((char *)item +
- csum_size);
- if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
- sector_sum->bytenr) {
- bytenr = sector_sum->bytenr;
- goto next_sector;
- }
- }
+ ins_size = (u32)(sums->len - total_bytes) >>
+ root->fs_info->sb->s_blocksize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ ins_size /= csum_size;
+ total_bytes += ins_size * root->sectorsize;
+ index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a58b63..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(inode_root);
goto cleanup;
}
- if (btrfs_root_refs(&inode_root->root_item) == 0) {
- ret = -ENOENT;
- goto cleanup;
- }
key.objectid = defrag->ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
}
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ return PTR_ERR(trans);
+ }
+
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+ NULL);
+ btrfs_end_transaction(trans, root);
+ if (ret <= 0) {
+ ret = 0;
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ NULL, GFP_NOFS);
+ *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ }
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+ return ret;
+}
+
static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct iov_iter *i,
loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
+ u64 release_bytes = 0;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool only_release_metadata = false;
bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
+ }
+
if (ret)
break;
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, write_bytes,
force_page_uptodate);
- if (ret) {
- btrfs_delalloc_release_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
break;
- }
copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* managed to copy.
*/
if (num_pages > dirty_pages) {
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- btrfs_delalloc_release_space(inode,
- (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
}
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
if (copied > 0) {
ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied,
NULL);
if (ret) {
- btrfs_delalloc_release_space(inode,
- dirty_pages << PAGE_CACHE_SHIFT);
btrfs_drop_pages(pages, num_pages);
break;
}
}
+ release_bytes = 0;
btrfs_drop_pages(pages, num_pages);
+ if (only_release_metadata && copied > 0) {
+ u64 lockstart = round_down(pos, root->sectorsize);
+ u64 lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
+ }
+
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
kfree(pages);
+ if (release_bytes) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ else
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+
return num_written ? num_written : ret;
}
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out_reserve_fail;
}
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start);
if (ret)
goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
}
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e53009657f0e..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
else
ret = 0;
spin_unlock(&rsv->lock);
- return 0;
+ return ret;
}
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
return 0;
}
+#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
+
/*
* This test just does basic sanity checking, making sure we can add an exten
* entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
{
int ret = 0;
- printk(KERN_ERR "Running extent only tests\n");
+ test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error adding initial extents %d\n", ret);
+ test_msg("Error adding initial extents %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing extent %d\n", ret);
+ test_msg("Error removing extent %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Full remove left some lingering space\n");
+ test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error adding half extent %d\n", ret);
+ test_msg("Error adding half extent %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing tail end %d\n", ret);
+ test_msg("Error removing tail end %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing front end %d\n", ret);
+ test_msg("Error removing front end %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
if (ret) {
- printk(KERN_ERR "Error removing middle peice %d\n", ret);
+ test_msg("Error removing middle piece %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Still have space at the front\n");
+ test_msg("Still have space at the front\n");
return -1;
}
if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
- printk(KERN_ERR "Still have space in the middle\n");
+ test_msg("Still have space in the middle\n");
return -1;
}
if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Still have space at the end\n");
+ test_msg("Still have space at the end\n");
return -1;
}
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
u64 next_bitmap_offset;
int ret;
- printk(KERN_ERR "Running bitmap only tests\n");
+ test_msg("Running bitmap only tests\n");
ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+ test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+ test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Left some space in bitmap\n");
+ test_msg("Left some space in bitmap\n");
return -1;
}
ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+ test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+ test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
}
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
ret = add_free_space_entry(cache, next_bitmap_offset -
(2 * 1024 * 1024), 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
- " %d\n", ret);
+ test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ ret);
return ret;
}
ret = btrfs_remove_free_space(cache, next_bitmap_offset -
(1 * 1024 * 1024), 2 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+ test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
2 * 1024 * 1024)) {
- printk(KERN_ERR "Left some space when removing overlapping\n");
+ test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
int ret;
- printk(KERN_ERR "Running bitmap and extent tests\n");
+ test_msg("Running bitmap and extent tests\n");
/*
* First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+ test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+ test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
if (check_exists(cache, 0, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Left remnants after our remove\n");
+ test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+ test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+ test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
- printk(KERN_ERR "Left remnants in the bitmap\n");
+ test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+ test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+ test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
- printk(KERN_ERR "Left over peices after removing "
- "overlapping\n");
+ test_msg("Left over peices after removing overlapping\n");
return -1;
}
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
/* Now with the extent entry offset into the bitmap */
ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+ test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+ test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+ test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
- printk(KERN_ERR "Left something behind when removing space");
+ test_msg("Left something behind when removing space");
return -1;
}
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
4 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+ test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
5 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
5 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Failed to free our space %d\n", ret);
+ test_msg("Failed to free our space %d\n", ret);
return ret;
}
if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
5 * 1024 * 1024)) {
- printk(KERN_ERR "Left stuff over\n");
+ test_msg("Left stuff over\n");
return -1;
}
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
*/
ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
if (ret) {
- printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+ test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
if (ret) {
- printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+ test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
if (ret) {
- printk(KERN_ERR "Error removing bitmap and extent "
- "overlapping %d\n", ret);
+ test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
}
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- printk(KERN_ERR "Running btrfs free space cache tests\n");
+ test_msg("Running btrfs free space cache tests\n");
cache = init_test_block_group();
if (!cache) {
- printk(KERN_ERR "Couldn't run the tests\n");
+ test_msg("Couldn't run the tests\n");
return;
}
@@ -3487,6 +3487,9 @@ out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
kfree(cache);
- printk(KERN_ERR "Free space cache tests finished\n");
+ test_msg("Free space cache tests finished\n");
}
-#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_free_space_cache(void);
-#endif
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a46b656d08de..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -57,6 +58,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
+#include "hash.h"
struct btrfs_iget_args {
u64 ino;
@@ -701,8 +703,12 @@ retry:
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
goto retry;
+ }
goto out_free;
}
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(!list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root,
+ &root->fs_info->delalloc_roots);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes--;
+ if (!root->nr_delalloc_inodes) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(list_empty(&root->delalloc_root));
+ list_del_init(&root->delalloc_root);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_delalloc_release_metadata(inode, len);
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list)
+ && do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
BTRFS_I(inode)->delalloc_bytes -= len;
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return 0;
return PTR_ERR(root);
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- /* parse ENOENT to 0 */
- return 0;
- }
/* step 2: get inode */
key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* 1 for the orphan item deletion. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
+ iput(inode);
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_orphan_add(trans, inode);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret) {
+ iput(inode);
goto out;
+ }
ret = btrfs_truncate(inode);
if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
int scanned = 0;
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS));
+ xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT));
+ }
+
slot++;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 0;
/* we found an xattr, assume we've got an acl */
- if (found_key.type == BTRFS_XATTR_ITEM_KEY)
- return 1;
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
/*
* we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
return ret;
}
-
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
- struct btrfs_path *path)
-{
- struct extent_buffer *eb;
- int level;
- u64 refs = 1;
-
- for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
- int ret;
-
- if (!path->nodes[level])
- break;
- eb = path->nodes[level];
- if (!btrfs_block_can_be_shared(root, eb))
- continue;
- ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
- &refs, NULL);
- if (refs > 1)
- return 1;
- }
- return 0;
-}
/*
* helper to start transaction for unlink and rmdir.
*
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
- struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_path *path;
- struct btrfs_dir_item *di;
- struct inode *inode = dentry->d_inode;
- u64 index;
- int check_link = 1;
- int err = -ENOSPC;
int ret;
- u64 ino = btrfs_ino(inode);
- u64 dir_ino = btrfs_ino(dir);
/*
* 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
- if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
- return ERR_PTR(-ENOSPC);
-
- /* check if there is someone else holds reference */
- if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
- return ERR_PTR(-ENOSPC);
-
- if (atomic_read(&inode->i_count) > 2)
- return ERR_PTR(-ENOSPC);
-
- if (xchg(&root->fs_info->enospc_unlink, 1))
- return ERR_PTR(-ENOSPC);
-
- path = btrfs_alloc_path();
- if (!path) {
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(-ENOMEM);
- }
+ if (PTR_ERR(trans) == -ENOSPC) {
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
- /* 1 for the orphan item */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- root->fs_info->enospc_unlink = 0;
- return trans;
- }
-
- path->skip_locking = 1;
- path->search_commit_root = 1;
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(dir)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(path);
-
- if (ret == 0 && S_ISREG(inode->i_mode)) {
- ret = btrfs_lookup_file_extent(trans, root, path,
- ino, (u64)-1, 0);
- if (ret < 0) {
- err = ret;
- goto out;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, 5);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
}
- BUG_ON(ret == 0); /* Corruption */
- if (check_path_shared(root, path))
- goto out;
- btrfs_release_path(path);
- }
-
- if (!check_link) {
- err = 0;
- goto out;
- }
-
- di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- if (di) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- err = 0;
- goto out;
- }
- btrfs_release_path(path);
-
- ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
- dentry->d_name.len, ino, dir_ino, 0,
- &index);
- if (ret) {
- err = ret;
- goto out;
- }
-
- if (check_path_shared(root, path))
- goto out;
-
- btrfs_release_path(path);
-
- /*
- * This is a commit root search, if we can lookup inode item and other
- * relative items in the commit root, it means the transaction of
- * dir/file creation has been committed, and the dir index item that we
- * delay to insert has also been inserted into the commit root. So
- * we needn't worry about the delayed insertion of the dir index item
- * here.
- */
- di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- BUG_ON(ret == -ENOENT);
- if (check_path_shared(root, path))
- goto out;
-
- err = 0;
-out:
- btrfs_free_path(path);
- /* Migrate the orphan reservation over */
- if (!err)
- err = btrfs_block_rsv_migrate(trans->block_rsv,
- &root->fs_info->global_block_rsv,
- trans->bytes_reserved);
-
- if (err) {
- btrfs_end_transaction(trans, root);
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(err);
- }
-
- trans->block_rsv = &root->fs_info->global_block_rsv;
- return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
trans->block_rsv = &root->fs_info->trans_block_rsv;
- BUG_ON(!root->fs_info->enospc_unlink);
- root->fs_info->enospc_unlink = 0;
+ trans->bytes_reserved = num_bytes;
}
- btrfs_end_transaction(trans, root);
+ return trans;
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int ret;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return ret;
}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
return -EPERM;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- __unlink_end_trans(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
u64 hole_size;
int err = 0;
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
if (size <= hole_start)
return 0;
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- err = -ENOENT;
- goto out;
- }
-
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (!(inode->i_sb->s_flags & MS_RDONLY))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&root->fs_info->cleanup_work_sem);
- if (ret)
+ if (ret) {
+ iput(inode);
inode = ERR_PTR(ret);
+ }
}
return inode;
@@ -6501,10 +6380,10 @@ out:
* returns 1 when the nocow is safe, < 1 on error, 0 if the
* block must be cow'd
*/
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes)
{
struct btrfs_path *path;
int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
u64 num_bytes;
int slot;
int found_type;
-
+ bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
/* not a regular extent, must cow */
goto out;
}
+
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (disk_bytenr == 0)
+ goto out;
+
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
backref_offset = btrfs_file_extent_offset(leaf, fi);
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (orig_start) {
+ *orig_start = key.offset - backref_offset;
+ *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ }
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end < offset + *len) {
- /* extent doesn't include our full range, must cow */
- goto out;
- }
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (IS_ERR(trans))
goto must_cow;
- if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1) {
+ if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1) {
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
- struct bio_vec *bvec = dio_bio->bi_io_vec;
struct bio *io_bio;
int skip_sum;
int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
}
dip->private = dio_bio->bi_private;
- io_bio->bi_private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
-
- dip->bytes = 0;
- do {
- dip->bytes += bvec->bv_len;
- bvec++;
- } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
-
+ dip->bytes = dio_bio->bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
io_bio->bi_private = dip;
dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
atomic_inc(&inode->i_dio_count);
smp_mb__after_atomic_inc();
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * call btrfs_wait_ordered_range to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ count = iov_length(iov, nr_segs);
+ btrfs_wait_ordered_range(inode, offset, count);
+
if (rw & WRITE) {
- count = iov_length(iov, nr_segs);
/*
* If the write DIO is beyond the EOF, we need update
* the isize, but it is protected by i_mutex. So we can
@@ -7493,7 +7382,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
@@ -7693,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret;
+ int ret = 0;
int err = 0;
struct btrfs_trans_handle *trans;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
- if (ret)
- return ret;
-
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
@@ -7960,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8332,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8341,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
struct list_head splice;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
- list_del_init(&binode->delalloc_inodes);
-
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
inode = igrab(&binode->vfs_inode);
if (!inode) {
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &binode->runtime_flags);
+ cond_resched_lock(&root->delalloc_lock);
continue;
}
-
- list_add_tail(&binode->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
if (unlikely(!work)) {
@@ -8376,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
&work->work);
cond_resched();
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+ return 0;
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ int ret;
- /* the filemap_flush will queue IO into the worker threads, but
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ ret = __start_delalloc_inodes(root, delay_iput);
+ /*
+ * the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
@@ -8397,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
- return 0;
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
+ return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+ int delay_iput)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = __start_delalloc_inodes(root, delay_iput);
+ btrfs_put_fs_root(root);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->delalloc_root_lock);
}
+ spin_unlock(&fs_info->delalloc_root_lock);
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
+ return 0;
+out:
if (!list_empty_careful(&splice)) {
- spin_lock(&root->fs_info->delalloc_lock);
- list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd7e96c73cb7..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!root->ref_cows)
return -EINVAL;
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+
+ btrfs_wait_ordered_extents(root, 0);
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot)
return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- mnt_drop_write_file(file);
- return -EINVAL;
- }
-
- mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(root, vol_args->name);
- kfree(vol_args);
-out:
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name);
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+ kfree(vol_args);
mnt_drop_write_file(file);
return ret;
}
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
+ int same_inode = 0;
/*
* TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
ret = -EINVAL;
if (src == inode)
- goto out_fput;
+ same_inode = 1;
/* the src must be open for reading */
if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
path->reada = 2;
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ }
} else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock(&src->i_mutex);
}
/* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
!IS_ALIGNED(destoff, bs))
goto out_unlock;
+ /* verify if ranges are overlapped within the same file */
+ if (same_inode) {
+ if (destoff + len > off && destoff < off + len)
+ goto out_unlock;
+ }
+
if (destoff > inode->i_size) {
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
@@ -2846,7 +2863,8 @@ out:
unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
+ if (!same_inode)
+ mutex_unlock(&inode->i_mutex);
vfree(buf);
btrfs_free_path(path);
out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- ret = -ENOENT;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
break;
}
- if (copy_to_user(arg, sa, sizeof(*sa)))
- ret = -EFAULT;
-
err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
if (err && !ret)
ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
return ret;
}
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(file, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(root, argp);
case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
struct workspace {
void *mem;
- void *buf; /* where compressed data goes */
- void *cbuf; /* where decompressed data goes */
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
struct list_head list;
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
+#include "disk-io.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
ordered_data_tree_panic(inode, -EEXIST, file_offset);
spin_unlock_irq(&tree->lock);
- spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
- &BTRFS_I(inode)->root->fs_info->ordered_extents);
- spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root,
+ &root->fs_info->ordered_roots);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
return 0;
}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
spin_unlock_irq(&tree->lock);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
trace_btrfs_ordered_extent_remove(inode, entry);
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (!root->nr_ordered_extents) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
}
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
struct list_head splice, works;
- struct list_head *cur;
struct btrfs_ordered_extent *ordered, *next;
struct inode *inode;
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_extents, &splice);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice)) {
- cur = splice.next;
- ordered = list_entry(cur, struct btrfs_ordered_extent,
- root_extent_list);
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
/*
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(ordered->inode);
+ if (!inode) {
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ atomic_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
- if (inode) {
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
- list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
- } else {
- btrfs_put_ordered_extent(ordered);
- }
+ ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->ordered_extent_lock);
list_for_each_entry_safe(ordered, next, &works, work_list) {
list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+ int delay_iput)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ btrfs_wait_ordered_extents(root, delay_iput);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
+}
+
/*
* this is used during transaction commit to write all the inodes
* added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&works);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
list_splice_tail(&splice,
&cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
ret = -ENOMEM;
goto out;
}
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
&work->work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len)
{
struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_sector_sum *sector_sums;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
i = (disk_bytenr - ordered_sum->bytenr) >>
inode->i_sb->s_blocksize_bits;
- sector_sums = ordered_sum->sums + i;
num_sectors = ordered_sum->len >>
inode->i_sb->s_blocksize_bits;
- for (; i < num_sectors; i++) {
- if (sector_sums[i].bytenr == disk_bytenr) {
- sum[index] = sector_sums[i].sum;
- index++;
- if (index == len)
- goto out;
- disk_bytenr += sectorsize;
- }
- }
+ num_sectors = min_t(int, len - index, num_sectors - i);
+ memcpy(sum + index, ordered_sum->sums + i,
+ num_sectors);
+
+ index += (int)num_sectors;
+ if (index == len)
+ goto out;
+ disk_bytenr += num_sectors * sectorsize;
}
}
out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
if (last_mod < root->fs_info->last_trans_committed)
return;
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
struct rb_node *last;
};
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
- /* bytenr on disk */
- u64 bytenr;
- u32 sum;
-};
-
struct btrfs_ordered_sum {
/* bytenr is the start of this extent on disk */
u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
/*
* this is the length in bytes covered by the sums array below.
*/
- unsigned long len;
+ int len;
struct list_head list;
- /* last field is a variable length array of btrfs_sector_sums */
- struct btrfs_sector_sum sums[];
+ /* last field is a variable length array of csums */
+ u32 sums[];
};
/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
unsigned long bytes)
{
- unsigned long num_sectors = (bytes + root->sectorsize - 1) /
- root->sectorsize;
- num_sectors++;
- return sizeof(struct btrfs_ordered_sum) +
- num_sectors * sizeof(struct btrfs_sector_sum);
+ int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
}
static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+ int delay_iput);
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-struct qgroup_rescan {
- struct btrfs_work work;
- struct btrfs_fs_info *fs_info;
-};
-
-static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
- struct qgroup_rescan *qscan);
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
int slot;
int ret = 0;
u64 flags = 0;
+ u64 rescan_progress = 0;
if (!fs_info->quota_enabled)
return 0;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
ptr);
- fs_info->qgroup_rescan_progress.objectid =
- btrfs_qgroup_status_rescan(l, ptr);
- if (fs_info->qgroup_flags &
- BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- struct qgroup_rescan *qscan =
- kmalloc(sizeof(*qscan), GFP_NOFS);
- if (!qscan) {
- ret = -ENOMEM;
- goto out;
- }
- fs_info->qgroup_rescan_progress.type = 0;
- fs_info->qgroup_rescan_progress.offset = 0;
- qgroup_rescan_start(fs_info, qscan);
- }
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -421,9 +412,18 @@ out:
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0) {
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
}
btrfs_free_path(path);
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
return ret < 0 ? ret : 0;
}
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
}
kfree(qgroup);
}
+ ulist_free(fs_info->qgroup_ulist);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
goto out;
}
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* initially create the quota tree
*/
@@ -916,6 +923,10 @@ out_free_root:
kfree(quota_root);
}
out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
u64 ref_root;
struct btrfs_qgroup *qgroup;
struct ulist *roots = NULL;
- struct ulist *tmp = NULL;
u64 seq;
int ret = 0;
int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
- ret = 0;
- goto unlock;
- }
- }
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
/*
* step 1: for each old ref, visit all nodes once and inc refcnt
*/
- tmp = ulist_alloc(GFP_ATOMIC);
- if (!tmp) {
- ret = -ENOMEM;
- goto unlock;
- }
+ ulist_reinit(fs_info->qgroup_ulist);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
+ ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
+ seq);
if (ret)
goto unlock;
/*
* step 2: walk from the new root
*/
- ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn,
- node->num_bytes, qgroup);
+ ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes, qgroup);
if (ret)
goto unlock;
/*
* step 3: walk again from old refs
*/
- ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn,
- node->num_bytes);
+ ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
+ seq, sgn, node->num_bytes);
if (ret)
goto unlock;
unlock:
spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
ulist_free(roots);
- ulist_free(tmp);
return ret;
}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
if (!ret && start_rescan_worker) {
- ret = btrfs_qgroup_rescan(fs_info);
- if (ret)
- pr_err("btrfs: start rescan quota failed: %d\n", ret);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
ret = 0;
}
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* in a first step, we check all affected qgroups if any limits would
* be exceeded
*/
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
- ret = ulist_add(ulist, qgroup->qgroupid,
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
(uintptr_t)qgroup, GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
}
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(ulist, glist->group->qgroupid,
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
(uintptr_t)glist->group, GFP_ATOMIC);
if (ret < 0)
goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
* no limits exceeded, now record the reservation into all qgroups
*/
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
-
return ret;
}
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct ulist *ulist = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
if (!qgroup)
goto out;
- ulist = ulist_alloc(GFP_ATOMIC);
- if (!ulist) {
- btrfs_std_error(fs_info, -ENOMEM);
- goto out;
- }
- ret = ulist_add(ulist, qgroup->qgroupid,
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
(uintptr_t)qgroup, GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(ulist, &uiter))) {
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
qg->reserved -= num_bytes;
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(ulist, glist->group->qgroupid,
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
(uintptr_t)glist->group, GFP_ATOMIC);
if (ret < 0)
goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(ulist);
}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
* returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
*/
static int
-qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path,
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_trans_handle *trans, struct ulist *tmp,
struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
- struct btrfs_fs_info *fs_info = qscan->fs_info;
struct ulist *roots = NULL;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
- struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan,
- work);
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_fs_info *fs_info = qscan->fs_info;
struct ulist *tmp = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(qscan, path, trans,
+ err = qgroup_rescan_leaf(fs_info, path, trans,
tmp, scratch_leaf);
}
if (err > 0)
@@ -2049,7 +2037,6 @@ out:
kfree(scratch_leaf);
ulist_free(tmp);
btrfs_free_path(path);
- kfree(qscan);
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
} else {
pr_err("btrfs: qgroup scan failed with %d\n", err);
}
-}
-static void
-qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
-{
- memset(&qscan->work, 0, sizeof(qscan->work));
- qscan->work.func = btrfs_qgroup_rescan_worker;
- qscan->fs_info = fs_info;
-
- pr_info("btrfs: qgroup scan started\n");
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
+ complete_all(&fs_info->qgroup_rescan_completion);
}
-int
-btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
{
int ret = 0;
- struct rb_node *n;
- struct btrfs_qgroup *qgroup;
- struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
- if (!qscan)
- return -ENOMEM;
+ if (!init_flags &&
+ (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+ ret = -EINVAL;
+ goto err;
+ }
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- ret = -EINPROGRESS;
- else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- ret = -EINVAL;
- if (ret) {
- spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- kfree(qscan);
- return ret;
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = -EINPROGRESS;
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto err;
+ }
+
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ init_completion(&fs_info->qgroup_rescan_completion);
+
+ memset(&fs_info->qgroup_rescan_work, 0,
+ sizeof(fs_info->qgroup_rescan_work));
+ fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+
+ if (ret) {
+err:
+ pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+ spin_lock(&fs_info->qgroup_lock);
/* clear all current qgroup tracking information */
for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup->excl_cmpr = 0;
}
spin_unlock(&fs_info->qgroup_lock);
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
- qgroup_rescan_start(fs_info, qscan);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+ running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (running)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
+ u64 last_snap = 0;
int ret;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
btrfs_set_root_last_snapshot(&root->root_item,
trans->transid - 1);
} else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
root_item->drop_level = 0;
+ /*
+ * abuse rtransid, it is safe because it is impossible to
+ * receive data into a relocation tree.
+ */
+ btrfs_set_root_rtransid(root_item, last_snap);
+ btrfs_set_root_otransid(root_item, trans->transid);
}
btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
+ reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
+ u64 last_snap;
+ u64 otransid;
+ u64 objectid;
LIST_HEAD(reloc_roots);
int found = 0;
int ret = 0;
@@ -2308,12 +2319,44 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
+
+ /*
+ * we keep the old last snapshod transid in rtranid when we
+ * created the relocation tree.
+ */
+ last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+ otransid = btrfs_root_otransid(&reloc_root->root_item);
+ objectid = reloc_root->root_key.offset;
+
ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
if (ret < 0) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
+ } else if (!ret) {
+ /*
+ * recover the last snapshot tranid to avoid
+ * the space balance break NOCOW.
+ */
+ root = read_fs_root(rc->extent_root->fs_info,
+ objectid);
+ if (IS_ERR(root))
+ continue;
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ continue;
+
+ trans = btrfs_join_transaction(root);
+ BUG_ON(IS_ERR(trans));
+
+ /* Check if the fs/file tree was snapshoted or not. */
+ if (btrfs_root_last_snapshot(&root->root_item) ==
+ otransid - 1)
+ btrfs_set_root_last_snapshot(&root->root_item,
+ last_snap);
+
+ btrfs_end_transaction(trans, root);
}
}
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+ SKINNY_METADATA);
if (tree_block_processed(bytenr, blocksize, rc))
return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = blocksize;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
if (ret < 0)
goto out;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (ret > 0) {
- if (key.objectid == bytenr &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
}
BUG_ON(ret);
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->block_group->key.objectid,
(unsigned long long)rc->block_group->flags);
- ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+ ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
if (ret < 0) {
err = ret;
goto out;
}
- btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+ btrfs_wait_all_ordered_extents(fs_info, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+ reloc_root = btrfs_read_fs_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
@@ -4396,10 +4458,8 @@ out:
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- size_t offset;
int ret;
u64 disk_bytenr;
LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
if (ret)
goto out;
+ disk_bytenr = ordered->start;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
-
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ sums->bytenr = disk_bytenr;
+ disk_bytenr += sums->len;
btrfs_add_ordered_sum(inode, ordered, sums);
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * lookup the root with the highest offset for a given objectid. The key we do
- * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the reak key of the tree we look for
+ *
+ * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
*/
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
- struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
{
- struct btrfs_path *path;
- struct btrfs_key search_key;
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
- search_key.objectid = objectid;
- search_key.type = BTRFS_ROOT_ITEM_KEY;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
- BUG_ON(ret == 0);
- if (path->slots[0] == 0) {
- ret = 1;
- goto out;
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
}
+
l = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
+
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid ||
+ if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- if (item)
- btrfs_read_root_item(l, slot, item);
- if (key)
- memcpy(key, &found_key, sizeof(found_key));
- ret = 0;
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
return ret;
}
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed. This is any root item with an
- * offset lower than the latest root. They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_root *dead_root;
- struct btrfs_root_item *ri;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_path *path;
- int ret;
- u32 nritems;
- struct extent_buffer *leaf;
- int slot;
-
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
-again:
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- if (slot >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
- goto next;
-
- if (key.objectid < objectid)
- goto next;
-
- if (key.objectid > objectid)
- break;
-
- ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
- if (btrfs_disk_root_refs(leaf, ri) != 0)
- goto next;
-
- memcpy(&found_key, &key, sizeof(key));
- key.offset++;
- btrfs_release_path(path);
- dead_root =
- btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &found_key);
- if (IS_ERR(dead_root)) {
- ret = PTR_ERR(dead_root);
- goto err;
- }
-
- ret = btrfs_add_dead_root(dead_root);
- if (ret)
- goto err;
- goto again;
-next:
- slot++;
- path->slots[0]++;
- }
- ret = 0;
-err:
- btrfs_free_path(path);
- return ret;
-}
-
int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
{
struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct btrfs_root *root;
int err = 0;
int ret;
+ bool can_recover = true;
+
+ if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ can_recover = false;
path = btrfs_alloc_path();
if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
- root = btrfs_read_fs_root_no_name(tree_root->fs_info,
- &root_key);
- if (!IS_ERR(root))
+ root = btrfs_read_fs_root(tree_root, &root_key);
+ err = PTR_RET(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to start trans to delete "
+ "orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_key.objectid);
+ btrfs_end_transaction(trans, tree_root);
+ if (err) {
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to delete root orphan "
+ "item");
+ break;
+ }
continue;
+ }
- ret = PTR_ERR(root);
- if (ret != -ENOENT) {
- err = ret;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
+ err = btrfs_init_fs_root(root);
+ if (err) {
+ btrfs_free_fs_root(root);
break;
}
- ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
- if (ret) {
- err = ret;
+ root->orphan_item_inserted = 1;
+
+ err = btrfs_insert_fs_root(root->fs_info, root);
+ if (err) {
+ BUG_ON(err == -EEXIST);
+ btrfs_free_fs_root(root);
break;
}
}
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_path *path;
int ret;
- struct btrfs_root_item *ri;
- struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
goto out;
BUG_ON(ret != 0);
- leaf = path->nodes[0];
- ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
ret = btrfs_del_item(trans, root, path);
out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
- int ret = 0;
- unsigned long i;
+ unsigned long index;
unsigned long num_sectors;
while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
if (!sum)
return 0;
+ index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
num_sectors = sum->len / sctx->sectorsize;
- for (i = 0; i < num_sectors; ++i) {
- if (sum->sums[i].bytenr == logical) {
- memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
- ret = 1;
- break;
- }
- }
- if (ret && i == num_sectors - 1) {
+ memcpy(csum, sum->sums + index, sctx->csum_size);
+ if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
}
- return ret;
+ return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2501,10 +2495,11 @@ again:
ret = scrub_extent(sctx, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
- extent_physical);
+ extent_logical - logical + physical);
if (ret)
goto out;
+ scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
logical += increment;
@@ -3204,16 +3199,18 @@ out:
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
- unsigned long index;
struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct page *page;
struct btrfs_root *local_root;
u64 physical_for_dev_replace;
u64 len;
- struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ unsigned long index;
int srcu_index;
+ int ret;
+ int err;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
return PTR_ERR(local_root);
}
+ if (btrfs_root_refs(&local_root->root_item) == 0) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return -ENOENT;
+ }
+
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
+ /* Avoid truncate/dio/punch hole.. */
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+
+ ret = 0;
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
len = nocow_ctx->len;
while (len >= PAGE_CACHE_SIZE) {
- struct page *page = NULL;
- int ret_sub;
-
index = offset >> PAGE_CACHE_SHIFT;
-
+again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
pr_err("find_or_create_page() failed\n");
ret = -ENOMEM;
- goto next_page;
+ goto out;
}
if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
goto next_page;
} else {
ClearPageError(page);
- ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+ err = extent_read_full_page(&BTRFS_I(inode)->
io_tree,
page, btrfs_get_extent,
nocow_ctx->mirror_num);
- if (ret_sub) {
- ret = ret_sub;
+ if (err) {
+ ret = err;
goto next_page;
}
- wait_on_page_locked(page);
+
+ lock_page(page);
+ /*
+ * If the page has been remove from the page cache,
+ * the data on it is meaningless, because it may be
+ * old one, the new data may be written into the new
+ * page in the page cache.
+ */
+ if (page->mapping != inode->i_mapping) {
+ page_cache_release(page);
+ goto again;
+ }
if (!PageUptodate(page)) {
ret = -EIO;
goto next_page;
}
}
- ret_sub = write_page_nocow(nocow_ctx->sctx,
- physical_for_dev_replace, page);
- if (ret_sub) {
- ret = ret_sub;
- goto next_page;
- }
-
+ err = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (err)
+ ret = err;
next_page:
- if (page) {
- unlock_page(page);
- put_page(page);
- }
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (ret)
+ break;
+
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
-
- if (inode)
- iput(inode);
+out:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
}
}
-static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc(void)
{
struct fs_path *p;
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
return p;
}
-static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc_reversed(void)
{
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return NULL;
p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
return p;
}
-static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
+static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
*
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
-static int iterate_inode_ref(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key, int resolve,
iterate_inode_ref_t iterate, void *ctx)
{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
unsigned long elem_size;
unsigned long ptr;
- p = fs_path_alloc_reversed(sctx);
+ p = fs_path_alloc_reversed();
if (!p)
return -ENOMEM;
tmp_path = alloc_path_for_send();
if (!tmp_path) {
- fs_path_free(sctx, p);
+ fs_path_free(p);
return -ENOMEM;
}
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
*
* path must point to the dir item when called.
*/
-static int iterate_dir_item(struct send_ctx *sctx,
- struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *found_key,
iterate_dir_item_t iterate, void *ctx)
{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
* Retrieve the first path of an inode. If an inode has more then one
* ref/hardlink, this is ignored.
*/
-static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
+static int get_inode_path(struct btrfs_root *root,
u64 ino, struct fs_path *path)
{
int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
goto out;
}
- ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
if (ret < 0)
goto out;
ret = 0;
@@ -1314,8 +1312,7 @@ out:
return ret;
}
-static int read_symlink(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int read_symlink(struct btrfs_root *root,
u64 ino,
struct fs_path *dest)
{
@@ -1562,8 +1559,7 @@ out:
* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
* generation of the parent dir and the name of the dir entry.
*/
-static int get_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root, u64 ino,
+static int get_first_ref(struct btrfs_root *root, u64 ino,
u64 *dir, u64 *dir_gen, struct fs_path *name)
{
int ret;
@@ -1628,8 +1624,7 @@ out:
return ret;
}
-static int is_first_ref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int is_first_ref(struct btrfs_root *root,
u64 ino, u64 dir,
const char *name, int name_len)
{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
u64 tmp_dir;
u64 tmp_dir_gen;
- tmp_name = fs_path_alloc(sctx);
+ tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
if (ret < 0)
goto out;
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
ret = !memcmp(tmp_name->start, name, name_len);
out:
- fs_path_free(sctx, tmp_name);
+ fs_path_free(tmp_name);
return ret;
}
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
if (!sctx->parent_root)
goto out;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name)
return -ENOMEM;
- ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
if (ret < 0)
goto out;
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
name->start, fs_path_len(name));
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
return ret;
}
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
* send_root or parent_root for ref lookup.
*/
if (ino < sctx->send_progress)
- ret = get_first_ref(sctx, sctx->send_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
else
- ret = get_first_ref(sctx, sctx->parent_root, ino,
- parent_ino, parent_gen, dest);
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
if (ret < 0)
goto out;
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_gen = 0;
int stop = 0;
- name = fs_path_alloc(sctx);
+ name = fs_path_alloc();
if (!name) {
ret = -ENOMEM;
goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
}
out:
- fs_path_free(sctx, name);
+ fs_path_free(name);
if (!ret)
fs_path_unreverse(dest);
return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
verbose_printk("btrfs: send_utimes %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
btrfs_free_path(path);
return ret;
}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
verbose_printk("btrfs: send_create_inode %llu\n", ino);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (S_ISLNK(mode)) {
fs_path_reset(p);
- ret = read_symlink(sctx, sctx->send_root, ino, p);
+ ret = read_symlink(sctx->send_root, ino, p);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
return 0;
}
-static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+static void __free_recorded_refs(struct list_head *head)
{
struct recorded_ref *cur;
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(sctx, cur->full_path);
+ fs_path_free(cur->full_path);
list_del(&cur->list);
kfree(cur);
}
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
static void free_recorded_refs(struct send_ctx *sctx)
{
- __free_recorded_refs(sctx, &sctx->new_refs);
- __free_recorded_refs(sctx, &sctx->deleted_refs);
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
}
/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
int ret;
struct fs_path *orphan;
- orphan = fs_path_alloc(sctx);
+ orphan = fs_path_alloc();
if (!orphan)
return -ENOMEM;
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
ret = send_rename(sctx, path, orphan);
out:
- fs_path_free(sctx, orphan);
+ fs_path_free(orphan);
return ret;
}
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
*/
BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
- valid_path = fs_path_alloc(sctx);
+ valid_path = fs_path_alloc();
if (!valid_path) {
ret = -ENOMEM;
goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
- ret = is_first_ref(sctx, sctx->parent_root,
- ow_inode, cur->dir, cur->name,
- cur->name_len);
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
if (ret < 0)
goto out;
if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
out:
free_recorded_refs(sctx);
ulist_free(check_dirs);
- fs_path_free(sctx, valid_path);
+ fs_path_free(valid_path);
return ret;
}
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *p;
u64 gen;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
out:
if (ret)
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_new_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_deleted_ref, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
return 0;
}
-static int find_iref(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_iref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
ctx.name = name;
ctx.found_idx = -1;
- ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
if (ret < 0)
return ret;
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+ ret = find_iref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, dir, name);
if (ret == -ENOENT)
ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+ ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
dir, name);
if (ret == -ENOENT)
ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, __record_changed_new_ref, sctx);
if (ret < 0)
goto out;
- ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
if (ret < 0)
goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
- sctx);
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
btrfs_release_path(path);
if (ret < 0)
goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
posix_acl_xattr_header dummy_acl;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
ret = send_remove_xattr(sctx, p, name, name_len);
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
- sctx->cmp_key, __process_new_xattr, sctx);
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_new_xattr, sctx);
return ret;
}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
{
int ret;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, __process_deleted_xattr, sctx);
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_deleted_xattr, sctx);
return ret;
}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmalloc(data_len, GFP_NOFS);
+ ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
if (!ctx->found_data)
return -ENOMEM;
- memcpy(ctx->found_data, data, data_len);
return 1;
}
return 0;
}
-static int find_xattr(struct send_ctx *sctx,
- struct btrfs_root *root,
+static int find_xattr(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
ctx.found_data = NULL;
ctx.found_data_len = 0;
- ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+ ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
if (ret < 0)
return ret;
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
char *found_data = NULL;
int found_data_len = 0;
- ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
- sctx->cmp_key, name, name_len, &found_data,
- &found_data_len);
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
int ret;
struct send_ctx *sctx = ctx;
- ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
- name, name_len, NULL, NULL);
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
{
int ret = 0;
- ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
sctx->cmp_key, __process_changed_new_xattr, sctx);
if (ret < 0)
goto out;
- ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
sctx->cmp_key, __process_changed_deleted_xattr, sctx);
out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
goto out;
}
- ret = iterate_dir_item(sctx, root, path, &found_key,
- __process_new_xattr, sctx);
+ ret = iterate_dir_item(root, path, &found_key,
+ __process_new_xattr, sctx);
if (ret < 0)
goto out;
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int num_read = 0;
mm_segment_t old_fs;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
set_fs(old_fs);
if (ret < 0)
return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
clone_root->root->objectid, clone_root->ino,
clone_root->offset);
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
} else {
- ret = get_inode_path(sctx, clone_root->root,
- clone_root->ino, p);
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
}
if (ret < 0)
goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc(sctx);
+ p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
tlv_put_failure:
out:
- fs_path_free(sctx, p);
+ fs_path_free(p);
return ret;
}
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
send_root = BTRFS_I(file_inode(mnt_file))->root;
fs_info = send_root->fs_info;
+ /*
+ * This is done when we lookup the root, it should already be complete
+ * by the time we get here.
+ */
+ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+ /*
+ * If we just created this root we need to make sure that the orphan
+ * cleanup has been done and committed since we search the commit root,
+ * so check its commit root transid with our otransid and if they match
+ * commit the transaction to make sure everything is updated.
+ */
+ down_read(&send_root->fs_info->extent_commit_sem);
+ if (btrfs_header_generation(send_root->commit_root) ==
+ btrfs_root_otransid(&send_root->root_item)) {
+ struct btrfs_trans_handle *trans;
+
+ up_read(&send_root->fs_info->extent_commit_sem);
+
+ trans = btrfs_attach_transaction_barrier(send_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ /* ENOENT means theres no transaction */
+ } else {
+ ret = btrfs_commit_transaction(trans, send_root);
+ if (ret)
+ goto out;
+ }
+ } else {
+ up_read(&send_root->fs_info->extent_commit_sem);
+ }
+
arg = memdup_user(arg_, sizeof(*arg));
if (IS_ERR(arg)) {
ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!clone_root) {
- ret = -EINVAL;
- goto out;
- }
if (IS_ERR(clone_root)) {
ret = PTR_ERR(clone_root);
goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (!sctx->parent_root) {
- ret = -EINVAL;
+ if (IS_ERR(sctx->parent_root)) {
+ ret = PTR_ERR(sctx->parent_root);
goto out;
}
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
#include "print-tree.h"
#include "xattr.h"
#include "volumes.h"
-#include "version.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
return;
}
ACCESS_ONCE(trans->transaction->aborted) = errno;
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
@@ -776,9 +778,6 @@ find_root:
if (IS_ERR(new_root))
return ERR_CAST(new_root);
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return ERR_PTR(-ENOENT);
-
dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_extents(root, 1);
+ btrfs_wait_all_ordered_extents(fs_info, 1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
}
+static void btrfs_print_info(void)
+{
+ printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+ "\n");
+}
+
static int __init init_btrfs_fs(void)
{
int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
btrfs_init_lockdep();
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ btrfs_print_info();
btrfs_test_free_space_cache();
-#endif
- printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
return 0;
unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
#define BTRFS_ROOT_TRANS_TAG 0
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START),
+ [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+ [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+};
+
static void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
root->commit_root = btrfs_root_node(root);
}
-static inline int can_join_transaction(struct btrfs_transaction *trans,
- int type)
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
- return !(trans->in_commit &&
- type != TRANS_JOIN &&
- type != TRANS_JOIN_NOLOCK);
+ return atomic_read(&trans->num_extwriters);
}
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
return -EROFS;
}
- if (fs_info->trans_no_join) {
- /*
- * If we are JOIN_NOLOCK we're already committing a current
- * transaction, we just need a handle to deal with something
- * when committing the transaction, such as inode cache and
- * space cache. It is a special case.
- */
- if (type != TRANS_JOIN_NOLOCK) {
- spin_unlock(&fs_info->trans_lock);
- return -EBUSY;
- }
- }
-
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
- if (!can_join_transaction(cur_trans, type)) {
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
- cur_trans->num_joined++;
+ extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -112,6 +147,12 @@ loop:
if (type == TRANS_ATTACH)
return -ENOENT;
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
- * to redo the trans_no_join checks above
+ * to redo the checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
goto loop;
@@ -131,17 +172,15 @@ loop:
}
atomic_set(&cur_trans->num_writers, 1);
- cur_trans->num_joined = 0;
+ extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
+ cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
"creating a fresh transaction\n");
atomic64_set(&fs_info->tree_mod_seq, 0);
- spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
}
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_BLOCKED &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !trans->aborted);
+}
+
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
- if (cur_trans && cur_trans->blocked) {
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
atomic_inc(&cur_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
wait_event(root->fs_info->transaction_wait,
- !cur_trans->blocked);
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ cur_trans->aborted);
put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
return ERR_PTR(-EROFS);
if (current->journal_info) {
- WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+ WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
* If we are ATTACH, it means we just want to catch the current
* transaction and commit it, so we needn't do sb_start_intwrite().
*/
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_start_intwrite(root->fs_info->sb);
if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ may_wait_transaction(root, type)) {
btrfs_commit_transaction(h, root);
goto again;
}
@@ -429,7 +477,7 @@ got_it:
return h;
join_fail:
- if (type < TRANS_JOIN_NOLOCK)
+ if (type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * btrfs_attach_transaction_barrier() - catch the running transaction
*
* It is similar to the above function, the differentia is this one
* will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
- wait_event(commit->commit_wait, commit->commit_done);
+ wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
}
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
- if (t->in_commit) {
- if (t->commit_done)
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
+ if (root->fs_info->global_block_rsv.space_info->full &&
+ btrfs_should_throttle_delayed_refs(trans, root))
+ return 1;
- ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
- return ret ? 1 : 0;
+ return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
}
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
int err;
smp_mb();
- if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ cur_trans->delayed_refs.flushing)
return 1;
updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
- int count = 0;
+ unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- while (count < 1) {
- unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (btrfs_should_throttle_delayed_refs(trans, root)) {
+ cur = max_t(unsigned long, cur, 1);
trans->delayed_ref_updates = 0;
- if (cur &&
- trans->transaction->delayed_refs.num_heads_ready > 64) {
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- } else {
- break;
- }
- count++;
+ btrfs_run_delayed_refs(trans, root, cur);
}
btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_create_pending_block_groups(trans, root);
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
- should_end_transaction(trans, root)) {
- trans->transaction->blocked = 1;
- smp_wmb();
+ should_end_transaction(trans, root) &&
+ ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+ spin_lock(&info->trans_lock);
+ if (cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&info->trans_lock);
}
- if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+ if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle) {
/*
* We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
}
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- struct blk_plug plug;
- blk_start_plug(&plug);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
- blk_finish_plug(&plug);
return werr;
}
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
{
int ret;
int ret2;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ blk_finish_plug(&plug);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->in_commit;
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
+
spin_lock(&info->trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->blocked;
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
spin_unlock(&info->trans_lock);
return ret;
}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
static void wait_current_trans_commit_start(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+ wait_event(root->fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ trans->aborted);
}
/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
wait_event(root->fs_info->transaction_wait,
- trans->commit_done || (trans->in_commit && !trans->blocked));
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ trans->aborted);
}
/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->trans_lock);
- if (list_empty(&cur_trans->list)) {
- spin_unlock(&root->fs_info->trans_lock);
- btrfs_end_transaction(trans, root);
- return;
- }
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
list_del_init(&cur_trans->list);
if (cur_trans == root->fs_info->running_transaction) {
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->running_transaction = NULL;
}
spin_unlock(&root->fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, root);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+
put_transaction(cur_trans);
put_transaction(cur_trans);
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
current->journal_info = NULL;
kmem_cache_free(btrfs_trans_handle_cachep, trans);
-
- spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 0;
- spin_unlock(&root->fs_info->trans_lock);
}
static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
- int snap_pending = 0;
int ret;
- if (!flush_on_commit) {
- spin_lock(&root->fs_info->trans_lock);
- if (!list_empty(&trans->transaction->pending_snapshots))
- snap_pending = 1;
- spin_unlock(&root->fs_info->trans_lock);
- }
-
- if (flush_on_commit || snap_pending) {
- ret = btrfs_start_delalloc_inodes(root, 1);
- if (ret)
- return ret;
- btrfs_wait_ordered_extents(root, 1);
- }
-
ret = btrfs_run_delayed_items(trans, root);
if (ret)
return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * btrfs_transaction state sequence:
- * in_commit = 0, blocked = 0 (initial)
- * in_commit = 1, blocked = 1
- * blocked = 0
- * commit_done = 1
- */
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ return btrfs_start_all_delalloc_inodes(fs_info, 1);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- unsigned long joined = 0;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
- DEFINE_WAIT(wait);
int ret;
- int should_grow = 0;
- unsigned long now = get_seconds();
ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* start sending their work down.
*/
cur_trans->delayed_refs.flushing = 1;
+ smp_wmb();
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- spin_lock(&cur_trans->commit_lock);
- if (cur_trans->in_commit) {
- spin_unlock(&cur_trans->commit_lock);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans, root);
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- trans->transaction->in_commit = 1;
- trans->transaction->blocked = 1;
- spin_unlock(&cur_trans->commit_lock);
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
- spin_lock(&root->fs_info->trans_lock);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (!prev_trans->commit_done) {
+ if (prev_trans->state != TRANS_STATE_COMPLETED) {
atomic_inc(&prev_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
}
- if (!btrfs_test_opt(root, SSD) &&
- (now < cur_trans->start_time || now - cur_trans->start_time < 1))
- should_grow = 1;
-
- do {
- joined = cur_trans->num_joined;
-
- WARN_ON(cur_trans != trans->transaction);
-
- ret = btrfs_flush_all_pending_stuffs(trans, root);
- if (ret)
- goto cleanup_transaction;
+ extwriter_counter_dec(cur_trans, trans->type);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ ret = btrfs_start_delalloc_flush(root->fs_info);
+ if (ret)
+ goto cleanup_transaction;
- if (atomic_read(&cur_trans->num_writers) > 1)
- schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- else if (should_grow)
- schedule_timeout(1);
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
- finish_wait(&cur_trans->writer_wait, &wait);
- } while (atomic_read(&cur_trans->num_writers) > 1 ||
- (should_grow && cur_trans->num_joined != joined));
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
+ /* some pending stuffs might be added after the previous flush. */
ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
goto cleanup_transaction;
+ btrfs_wait_delalloc_flush(root->fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
- * no_join so make sure to wait for num_writers to == 1 again.
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&root->fs_info->trans_lock);
- root->fs_info->trans_no_join = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&root->fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
- trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
- root->fs_info->trans_no_join = 0;
spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->reloc_mutex);
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- cur_trans->commit_done = 1;
-
root->fs_info->last_trans_committed = cur_trans->transid;
-
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
put_transaction(cur_trans);
put_transaction(cur_trans);
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
- if (fs_info->sb->s_flags & MS_RDONLY) {
- pr_debug("btrfs: cleaner called for RO fs!\n");
- return 0;
- }
-
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
#include "delayed-ref.h"
#include "ctree.h"
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING = 0,
+ TRANS_STATE_BLOCKED = 1,
+ TRANS_STATE_COMMIT_START = 2,
+ TRANS_STATE_COMMIT_DOING = 3,
+ TRANS_STATE_UNBLOCKED = 4,
+ TRANS_STATE_COMPLETED = 5,
+ TRANS_STATE_MAX = 6,
+};
+
struct btrfs_transaction {
u64 transid;
/*
+ * total external writers(USERSPACE/START/ATTACH) in this
+ * transaction, it must be zero before the transaction is
+ * being committed
+ */
+ atomic_t num_extwriters;
+ /*
* total writers in this transaction, it must be zero before the
* transaction can end
*/
atomic_t num_writers;
atomic_t use_count;
- unsigned long num_joined;
-
- spinlock_t commit_lock;
- int in_commit;
- int commit_done;
- int blocked;
+ /* Be protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head ordered_operations;
+ struct list_head pending_chunks;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
-enum btrfs_trans_type {
- TRANS_START,
- TRANS_JOIN,
- TRANS_USERSPACE,
- TRANS_JOIN_NOLOCK,
- TRANS_ATTACH,
-};
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_USERSPACE (1U << 8)
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+
+#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
+ __TRANS_ATTACH)
struct btrfs_trans_handle {
u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
- enum btrfs_trans_type type;
+ unsigned int type;
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
{
int ret = 0;
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+ }
+
if (wc->pin)
ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
eb->start, eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (wc->pin && btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(log, eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
- if (ret && ret != -ENOENT)
- break;
- ret = 0;
- } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
unsigned long log_transid = 0;
+ struct blk_plug plug;
mutex_lock(&root->log_mutex);
log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
+ blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ blk_finish_plug(&plug);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ ret = btrfs_write_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
btrfs_wait_logged_extents(log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root_no_radix(log_root_tree,
- &found_key);
+ log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 new_alloced = ulist->nodes_alloced + 128;
struct ulist_node *new_nodes;
void *old = NULL;
+ int i;
+
+ for (i = 0; i < ulist->nnodes; i++)
+ rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
/*
* if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
ulist->nodes = new_nodes;
ulist->nodes_alloced = new_alloced;
+
+ /*
+ * krealloc actually uses memcpy, which does not copy rb_node
+ * pointers, so we have to do it ourselves. Otherwise we may
+ * be bitten by crashes.
+ */
+ for (i = 0; i < ulist->nnodes; i++) {
+ ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
+ if (ret < 0)
+ return ret;
+ }
}
ulist->nodes[ulist->nnodes].val = val;
ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
return ret;
}
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= *start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ *start)
+ continue;
+ *start = map->stripes[i].physical +
+ em->orig_block_len;
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+
/*
* find_free_dev_extent - find free space in the specified device
* @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
*/
search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
max_hole_start = search_start;
max_hole_size = 0;
hole_size = 0;
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
- goto error;
+ goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto error;
- }
path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(trans, device,
+ &search_start,
+ hole_size))
+ hole_size = 0;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
max_hole_size = hole_size;
}
+ if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
out:
btrfs_free_path(path);
-error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
return ret;
}
-static noinline int find_next_chunk(struct btrfs_root *root,
- u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- struct btrfs_chunk *chunk;
- struct btrfs_key found_key;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_CHUNK_ITEM_KEY;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
-
- BUG_ON(ret == 0); /* Corruption */
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
- ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
- *offset = 0;
- } else {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
- if (found_key.objectid != objectid)
- *offset = 0;
- else {
- chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_chunk);
- *offset = found_key.offset +
- btrfs_chunk_length(path->nodes[0], chunk);
- }
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
}
- ret = 0;
-error:
- btrfs_free_path(path);
+ read_unlock(&em_tree->lock);
+
return ret;
}
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
- printk(KERN_ERR "btrfs: unable to go below four devices "
- "on raid10\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid1\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
root->fs_info->fs_devices->rw_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid5\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
root->fs_info->fs_devices->rw_devices <= 3) {
- printk(KERN_ERR "btrfs: unable to go below three "
- "devices on raid6\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bh = NULL;
disk_super = NULL;
if (!device) {
- printk(KERN_ERR "btrfs: no missing devices found to "
- "remove\n");
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
if (device->is_tgtdev_for_dev_replace) {
- pr_err("btrfs: unable to remove the dev_replace target dev\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto error_brelse;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
- printk(KERN_ERR "btrfs: unable to remove the only writeable "
- "device\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
- if (IS_ERR(tsk))
- return PTR_ERR(tsk);
-
- return 0;
+ return PTR_RET(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes_out, u64 *stripe_size_out,
- u64 start, u64 type)
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(device,
+ ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- *map_ret = map;
num_bytes = stripe_size * data_stripes;
- *stripe_size_out = stripe_size;
- *num_bytes_out = num_bytes;
-
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
+ em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
write_unlock(&em_tree->lock);
if (ret) {
free_extent_map(em);
goto error;
}
- for (i = 0; i < map->num_stripes; ++i) {
- struct btrfs_device *device;
- u64 dev_offset;
-
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device,
- info->chunk_root->root_key.objectid,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, stripe_size);
- if (ret)
- goto error_dev_extent;
- }
-
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- if (ret) {
- i = map->num_stripes - 1;
- goto error_dev_extent;
- }
+ if (ret)
+ goto error_del_extent;
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
kfree(devices_info);
return 0;
-error_dev_extent:
- for (; i >= 0; i--) {
- struct btrfs_device *device;
- int err;
-
- device = map->stripes[i].dev;
- err = btrfs_free_dev_extent(trans, device, start);
- if (err) {
- btrfs_abort_transaction(trans, extent_root, err);
- break;
- }
- }
+error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
return ret;
}
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
- struct map_lookup *map, u64 chunk_offset,
- u64 chunk_size, u64 stripe_size)
+ u64 chunk_offset, u64 chunk_size)
{
- u64 dev_offset;
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- size_t item_size = btrfs_chunk_item_size(map->num_stripes);
- int index = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
int ret;
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk)
- return -ENOMEM;
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out_free;
- index++;
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);
- index = 0;
stripe = &chunk->stripe;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
- index++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
}
-out_free:
+out:
kfree(chunk);
+ free_extent_map(em);
return ret;
}
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
- u64 chunk_size;
- u64 stripe_size;
- struct map_lookup *map;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
- int ret;
-
- ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &chunk_offset);
- if (ret)
- return ret;
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, type);
- if (ret)
- return ret;
-
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret)
- return ret;
- return 0;
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
u64 sys_chunk_offset;
- u64 chunk_size;
- u64 sys_chunk_size;
- u64 stripe_size;
- u64 sys_stripe_size;
u64 alloc_profile;
- struct map_lookup *map;
- struct map_lookup *sys_map;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
- ret = find_next_chunk(fs_info->chunk_root,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- if (ret)
- return ret;
-
+ chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = chunk_offset + chunk_size;
-
+ sys_chunk_offset = find_next_chunk(root->fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
- &sys_chunk_size, &sys_stripe_size,
- sys_chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- /*
- * Modifying chunk tree needs allocating new blocks from both
- * system block group and metadata block group. So we only can
- * do operations require modifying the chunk tree after both
- * block groups were created.
- */
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- ret = __finish_chunk_alloc(trans, extent_root, sys_map,
- sys_chunk_offset, sys_chunk_size,
- sys_stripe_size);
if (ret)
btrfs_abort_transaction(trans, root, ret);
-
out:
-
return ret;
}
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
- if (mirror_num > map->num_stripes)
- mirror_num = 0;
-
stripe_len = map->stripe_len;
stripe_nr = offset;
/*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
return NULL;
list_add(&device->dev_list,
&fs_devices->devices);
- device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
}
fill_device_from_item(leaf, dev_item, device);
- device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
return ret;
}
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_root *root,
struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
struct btrfs_mapping_tree *map_tree,
u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(unlock_buffer);
/*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct buffer_head *head, *bh;
+ *dirty = false;
+ *writeback = false;
+
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ return;
+
+ if (PageWriteback(page))
+ *writeback = true;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (buffer_locked(bh))
+ *writeback = true;
+
+ if (buffer_dirty(bh))
+ *dirty = true;
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
* Block until a buffer comes unlocked. This doesn't stop it
* from becoming locked again - you have to lock it yourself
* if you want to preserve its state.
@@ -1454,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh)
* block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -1465,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh)
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ unsigned int stop = length + offset;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
+ /*
+ * Check for overflow
+ */
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
@@ -1481,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
next = bh->b_this_page;
/*
+ * Are we still fully in range ?
+ */
+ if (next_off > stop)
+ goto out;
+
+ /*
* is this block fully invalidated?
*/
if (offset <= curr_off)
@@ -1501,6 +1549,7 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
#include <linux/mount.h>
#include "internal.h"
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
struct cachefiles_lookup_data {
struct cachefiles_xattr *auxdata; /* auxiliary data */
char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
object = container_of(_object, struct cachefiles_object, fscache);
cache = container_of(object->fscache.cache, struct cachefiles_cache,
cache);
+
+ if (!fscache_use_cookie(_object)) {
+ _leave(" [relinq]");
+ return;
+ }
+
cookie = object->fscache.cookie;
if (!cookie->def->get_aux) {
+ fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
+ fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ fscache_unuse_cookie(_object);
ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobject: OBJ%x\n",
prefix, object->fscache.debug_id);
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, fscache_object_states[object->fscache.state],
+ prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
found_dentry:
kdebug("preemptive burial: OBJ%x [%s] %p",
object->fscache.debug_id,
- fscache_object_states[object->fscache.state],
+ object->fscache.state->name,
dentry);
- if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/swap.h>
#include "internal.h"
/*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
*/
static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct page *netpage,
- struct pagevec *pagevec)
+ struct page *netpage)
{
struct cachefiles_one_read *monitor;
struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
_enter("");
- pagevec_reinit(pagevec);
-
_debug("read back %p{%lu,%d}",
netpage, netpage->index, page_count(netpage));
@@ -283,9 +281,7 @@ installed_new_backing_page:
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- pagevec_add(pagevec, backpage);
- __pagevec_lru_add_file(pagevec);
+ lru_cache_add_file(backpage);
read_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
if (block) {
/* submit the apparently valid page to the backing fs to be
* read from disk */
- ret = cachefiles_read_backing_file_one(object, op, page,
- &pagevec);
+ ret = cachefiles_read_backing_file_one(object, op, page);
} else if (cachefiles_has_space(cache, 0, 1) == 0) {
/* there's space in the cache we can use */
fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
{
struct cachefiles_one_read *monitor = NULL;
struct address_space *bmapping = object->backer->d_inode->i_mapping;
- struct pagevec lru_pvec;
struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
int ret = 0;
_enter("");
- pagevec_init(&lru_pvec, 0);
-
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- if (!pagevec_add(&lru_pvec, backpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(backpage);
reread_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto nomem;
}
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(netpage);
/* install a monitor */
page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
fscache_mark_page_cached(op, netpage);
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
+ lru_cache_add_file(netpage);
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
out:
/* tidy up */
- pagevec_lru_add_file(&lru_pvec);
-
if (newpage)
page_cache_release(newpage);
if (netpage)
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
page->private = 0;
ClearPagePrivate(page);
} else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
+ dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+ inode, page, page->index, offset, length);
}
}
@@ -438,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page_offset(page);
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
+ loff_t page_off = page_offset(page);
long writeback_stat;
+ u64 truncate_size, snap_size = 0;
+ u32 truncate_seq;
+ int err = 0, len = PAGE_CACHE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -474,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
/* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
+ if (page_off >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
+ goto out;
+ }
+ if (snap_size < page_off + len)
+ len = snap_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
@@ -494,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
+ truncate_seq, truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -631,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
ceph_osdc_put_request(req);
}
-static struct ceph_osd_request *
-ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
- struct ceph_snap_context *snapc, int num_ops)
-{
- struct ceph_fs_client *fsc;
- struct ceph_inode_info *ci;
- struct ceph_vino vino;
-
- fsc = ceph_inode_to_client(inode);
- ci = ceph_inode(inode);
- vino = ceph_vino(inode);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-
- return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
- snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
-}
-
/*
* initiate async writeback
*/
@@ -658,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -670,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size;
+ u64 truncate_size, snap_size;
+ u32 truncate_seq;
/*
* Include a 'sync' in the OSD request if this is a data
* integrity write (e.g., O_SYNC write or fsync()), or if our
* cap is being revoked.
*/
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ if ((wbc->sync_mode == WB_SYNC_ALL) ||
+ ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
do_sync = 1;
dout("writepages_start %p dosync=%d (mode=%s)\n",
inode, do_sync,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- fsc = ceph_inode_to_client(inode);
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
@@ -728,6 +717,14 @@ retry:
snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
+
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
* start of the original file range. */
@@ -739,7 +736,6 @@ retry:
while (!done && index <= end) {
int num_ops = do_sync ? 2 : 1;
- struct ceph_vino vino;
unsigned i;
int first;
pgoff_t next;
@@ -833,17 +829,18 @@ get_more_pages:
* that it will use.
*/
if (locked_pages == 0) {
- size_t size;
-
BUG_ON(pages);
-
/* prepare async write request */
offset = (u64)page_offset(page);
len = wsize;
- req = ceph_writepages_osd_request(inode,
- offset, &len, snapc,
- num_ops);
-
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
unlock_page(page);
@@ -854,8 +851,8 @@ get_more_pages:
req->r_inode = inode;
max_pages = calc_pages_for(0, (u64)len);
- size = max_pages * sizeof (*pages);
- pages = kmalloc(size, GFP_NOFS);
+ pages = kmalloc(max_pages * sizeof (*pages),
+ GFP_NOFS);
if (!pages) {
pool = fsc->wb_pagevec_pool;
pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap) {
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ci->i_auth_cap = cap;
+ } else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * exclude caps issued by non-auth MDS, but are been revoking
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap && __cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
- __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
}
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly grant caps ? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a40ceda47a32..868b61d56cac 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -793,6 +793,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_SHARED on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
err = ceph_mdsc_do_request(mdsc, dir, req);
if (err) {
d_drop(dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 16c989d3e23c..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
hold_mutex = true;
@@ -809,7 +808,6 @@ retry_snap:
out:
if (hold_mutex)
mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e20d62e..2f5edff2f980 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
+ dn, d_count(dn),
+ realdn, d_count(realdn),
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -1465,7 +1465,16 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode);
- __ceph_do_pending_vmtruncate(inode, true);
+ if (!mutex_trylock(&inode->i_mutex)) {
+ /*
+ * the i_mutex can be hold by a writer who is waiting for
+ * caps. wake up waiters, they will do pending vmtruncate.
+ */
+ wake_up_all(&ci->i_cap_wq);
+ mutex_lock(&inode->i_mutex);
+ }
+ __ceph_do_pending_vmtruncate(inode);
+ mutex_unlock(&inode->i_mutex);
iput(inode);
}
@@ -1492,7 +1501,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
* Make sure any pending truncation is applied before doing anything
* that may depend on it.
*/
-void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
+void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
@@ -1525,11 +1534,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- if (needlock)
- mutex_lock(&inode->i_mutex);
truncate_inode_pages(inode->i_mapping, to);
- if (needlock)
- mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
@@ -1588,7 +1593,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
err = inode_change_ok(inode, attr);
if (err != 0)
@@ -1770,7 +1775,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode, false);
+ __ceph_do_pending_vmtruncate(inode);
return err;
out:
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e0b4ef31d3c8..a5ce62eb7806 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -196,8 +196,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
- if (r < 0)
+ if (r < 0) {
+ up_read(&osdc->map_sem);
return -EIO;
+ }
dl.file_offset -= dl.object_offset;
dl.object_size = ceph_file_layout_object_size(ci->i_layout);
dl.block_size = ceph_file_layout_su(ci->i_layout);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 690f73f42425..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
}
/**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 74fd2898b2ab..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
num = le32_to_cpu(head->num);
dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
session->s_num_cap_releases += num;
/* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
+ dentry, d_count(dentry), *base, len, path);
return path;
}
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
+ if (mdsc->mdsmap == NULL) {
+ kfree(mdsc);
return -ENOMEM;
+ }
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
+
+ if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ continue;
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (info->export_targets == NULL)
+ goto badmem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
}
}
@@ -170,7 +174,7 @@ bad:
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
}
err = -EINVAL;
dev_name_end--; /* back up to ':' separator */
- if (*dev_name_end != ':') {
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
pr_err("device name is missing path (no : separator in %s)\n",
dev_name);
goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4aea2e..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
- goto out;
+ return err;
}
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2906ee276408..603f18a65c12 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
select CRYPTO_ECB
select CRYPTO_DES
select CRYPTO_SHA256
+ select CRYPTO_CMAC
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d59748346020..f3ac4154cbb6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
tcon->nativeFileSystem);
}
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
- "\nPathComponentMax: %d Status: 0x%d",
+ "\n\tPathComponentMax: %d Status: 0x%d",
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -224,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
+ if (server->ops->dump_share_caps)
+ server->ops->dump_share_caps(m, tcon);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
@@ -595,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
return single_open(file, cifs_security_flags_proc_show, NULL);
}
+/*
+ * Ensure that if someone sets a MUST flag, that we disable all other MAY
+ * flags except for the ones corresponding to the given MUST flag. If there are
+ * multiple MUST flags, then try to prefer more secure ones.
+ */
+static void
+cifs_security_flags_handle_must_flags(unsigned int *flags)
+{
+ unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
+
+ if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+ *flags = CIFSSEC_MUST_KRB5;
+ else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+ *flags = CIFSSEC_MUST_NTLMSSP;
+ else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+ *flags = CIFSSEC_MUST_NTLMV2;
+ else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
+ *flags = CIFSSEC_MUST_NTLM;
+ else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+ *flags = CIFSSEC_MUST_LANMAN;
+ else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+ *flags = CIFSSEC_MUST_PLNTXT;
+
+ *flags |= signflags;
+}
+
static ssize_t cifs_security_flags_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
+ int rc;
unsigned int flags;
char flags_string[12];
char c;
@@ -620,26 +649,35 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
global_secflags = CIFSSEC_MAX;
return count;
} else if (!isdigit(c)) {
- cifs_dbg(VFS, "invalid flag %c\n", c);
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
return -EINVAL;
}
}
- /* else we have a number */
- flags = simple_strtoul(flags_string, NULL, 0);
+ /* else we have a number */
+ rc = kstrtouint(flags_string, 0, &flags);
+ if (rc) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
+ return rc;
+ }
cifs_dbg(FYI, "sec flags 0x%x\n", flags);
- if (flags <= 0) {
- cifs_dbg(VFS, "invalid security flags %s\n", flags_string);
+ if (flags == 0) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
return -EINVAL;
}
if (flags & ~CIFSSEC_MASK) {
- cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n",
+ cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
flags & ~CIFSSEC_MASK);
return -EINVAL;
}
+
+ cifs_security_flags_handle_must_flags(&flags);
+
/* flags look ok - update the global security flags for cifs module */
global_secflags = flags;
if (global_secflags & CIFSSEC_MUST_SIGN) {
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e21..fe8d6276410a 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
/*
* UniStrupr: Upper case a unicode string
*/
-static inline wchar_t *
-UniStrupr(register wchar_t *upin)
+static inline __le16 *
+UniStrupr(register __le16 *upin)
{
- register wchar_t *up;
+ register __le16 *up;
up = upin;
while (*up) { /* For all characters */
- *up = UniToupper(*up);
+ *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
up++;
}
return upin; /* Return input pointer */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 71436d1fca13..fc6f4f3a1a9d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/cifsencrypt.c
*
- * Copyright (C) International Business Machines Corp., 2005,2006
+ * Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -31,6 +31,37 @@
#include <linux/random.h>
#include <linux/highmem.h>
+static int
+cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ if (server->secmech.sdescmd5 != NULL)
+ return 0; /* already allocated */
+
+ server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(server->secmech.md5)) {
+ cifs_dbg(VFS, "could not allocate crypto md5\n");
+ rc = PTR_ERR(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.md5);
+ server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdescmd5) {
+ crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+ server->secmech.sdescmd5->shash.flags = 0x0;
+
+ return 0;
+}
+
/*
* Calculate and return the CIFS signature based on the mac key and SMB PDU.
* The 16 byte signature must be allocated by the caller. Note we only use the
@@ -50,8 +81,11 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
return -EINVAL;
if (!server->secmech.sdescmd5) {
- cifs_dbg(VFS, "%s: Can't generate signature\n", __func__);
- return -1;
+ rc = cifs_crypto_shash_md5_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
+ return -1;
+ }
}
rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
@@ -276,7 +310,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
- memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
return 0;
@@ -389,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (blobptr + attrsize > blobend)
break;
if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
- if (!attrsize)
+ if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
break;
if (!ses->domainName) {
ses->domainName =
@@ -414,7 +447,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
int rc = 0;
int len;
char nt_hash[CIFS_NTHASH_SIZE];
- wchar_t *user;
+ __le16 *user;
wchar_t *domain;
wchar_t *server;
@@ -439,7 +472,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
return rc;
}
- /* convert ses->user_name to unicode and uppercase */
+ /* convert ses->user_name to unicode */
len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
@@ -448,7 +481,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
if (len) {
- len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+ len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
UniStrupr(user);
} else {
memset(user, '\0', 2);
@@ -536,7 +569,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
- if (ses->server->secType == RawNTLMSSP)
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
memcpy(ses->auth_key.response + offset,
ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
@@ -557,6 +590,36 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
+static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ /* check if already allocated */
+ if (server->secmech.sdeschmacmd5)
+ return 0;
+
+ server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+ if (IS_ERR(server->secmech.hmacmd5)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
+ rc = PTR_ERR(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacmd5);
+ server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacmd5) {
+ crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+ server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+ return 0;
+}
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
@@ -568,7 +631,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
- if (ses->server->secType == RawNTLMSSP) {
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
rc = find_domain_name(ses, nls_cp);
if (rc) {
@@ -607,6 +670,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+ rc = crypto_hmacmd5_alloc(ses->server);
+ if (rc) {
+ cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
+ goto setup_ntlmv2_rsp_ret;
+ }
+
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
@@ -706,94 +775,32 @@ calc_seckey(struct cifs_ses *ses)
void
cifs_crypto_shash_release(struct TCP_Server_Info *server)
{
- if (server->secmech.hmacsha256)
- crypto_free_shash(server->secmech.hmacsha256);
-
- if (server->secmech.md5)
- crypto_free_shash(server->secmech.md5);
-
- if (server->secmech.hmacmd5)
- crypto_free_shash(server->secmech.hmacmd5);
-
- kfree(server->secmech.sdeschmacsha256);
-
- kfree(server->secmech.sdeschmacmd5);
-
- kfree(server->secmech.sdescmd5);
-}
-
-int
-cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- int rc;
- unsigned int size;
-
- server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
- if (IS_ERR(server->secmech.hmacmd5)) {
- cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
- return PTR_ERR(server->secmech.hmacmd5);
- }
-
- server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(server->secmech.md5)) {
- cifs_dbg(VFS, "could not allocate crypto md5\n");
- rc = PTR_ERR(server->secmech.md5);
- goto crypto_allocate_md5_fail;
+ if (server->secmech.cmacaes) {
+ crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
}
- server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
- if (IS_ERR(server->secmech.hmacsha256)) {
- cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
- rc = PTR_ERR(server->secmech.hmacsha256);
- goto crypto_allocate_hmacsha256_fail;
- }
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacmd5);
- server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacmd5) {
- rc = -ENOMEM;
- goto crypto_allocate_hmacmd5_sdesc_fail;
+ if (server->secmech.hmacsha256) {
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
}
- server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
- server->secmech.sdeschmacmd5->shash.flags = 0x0;
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.md5);
- server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdescmd5) {
- rc = -ENOMEM;
- goto crypto_allocate_md5_sdesc_fail;
+ if (server->secmech.md5) {
+ crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
}
- server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
- server->secmech.sdescmd5->shash.flags = 0x0;
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacsha256);
- server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacsha256) {
- rc = -ENOMEM;
- goto crypto_allocate_hmacsha256_sdesc_fail;
+ if (server->secmech.hmacmd5) {
+ crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
}
- server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
- server->secmech.sdeschmacsha256->shash.flags = 0x0;
-
- return 0;
-
-crypto_allocate_hmacsha256_sdesc_fail:
- kfree(server->secmech.sdescmd5);
-crypto_allocate_md5_sdesc_fail:
+ kfree(server->secmech.sdesccmacaes);
+ server->secmech.sdesccmacaes = NULL;
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
kfree(server->secmech.sdeschmacmd5);
-
-crypto_allocate_hmacmd5_sdesc_fail:
- crypto_free_shash(server->secmech.hmacsha256);
-
-crypto_allocate_hmacsha256_fail:
- crypto_free_shash(server->secmech.md5);
-
-crypto_allocate_md5_fail:
- crypto_free_shash(server->secmech.hmacmd5);
-
- return rc;
+ server->secmech.sdeschmacmd5 = NULL;
+ kfree(server->secmech.sdescmd5);
+ server->secmech.sdescmd5 = NULL;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a445e71746fa..85ea98d139fc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb)
goto out_no_root;
}
+ if (cifs_sb_master_tcon(cifs_sb)->nocase)
+ sb->s_d_op = &cifs_ci_dentry_ops;
+ else
+ sb->s_d_op = &cifs_dentry_ops;
+
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
rc = -ENOMEM;
goto out_no_root;
}
- /* do that *after* d_make_root() - we want NULL ->d_op for root here */
- if (cifs_sb_master_tcon(cifs_sb)->nocase)
- sb->s_d_op = &cifs_ci_dentry_ops;
- else
- sb->s_d_op = &cifs_dentry_ops;
-
#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifs_dbg(FYI, "export ops supported\n");
@@ -312,11 +311,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
}
static void
-cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
{
+ if (ses->sectype == Unspecified)
+ return;
+
seq_printf(s, ",sec=");
- switch (server->secType) {
+ switch (ses->sectype) {
case LANMAN:
seq_printf(s, "lanman");
break;
@@ -338,7 +340,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
break;
}
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->sign)
seq_printf(s, "i");
}
@@ -369,7 +371,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
- cifs_show_security(s, tcon->ses->server);
+ cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d05b3028e3b9..ea723a5e8226 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.0"
+#define CIFS_VERSION "2.01"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f07f6fbe494..52ca861ed35e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -44,6 +44,7 @@
#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
#define MAX_SERVER_SIZE 15
#define MAX_SHARE_SIZE 80
+#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
@@ -101,20 +102,14 @@ enum statusEnum {
};
enum securityEnum {
- LANMAN = 0, /* Legacy LANMAN auth */
+ Unspecified = 0, /* not specified */
+ LANMAN, /* Legacy LANMAN auth */
NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
-/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
Kerberos, /* Kerberos via SPNEGO */
};
-enum protocolEnum {
- TCP = 0,
- SCTP
- /* Netbios frames protocol not supported at this time */
-};
-
struct session_key {
unsigned int len;
char *response;
@@ -131,9 +126,11 @@ struct cifs_secmech {
struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
struct crypto_shash *md5; /* md5 hash function */
struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+ struct crypto_shash *cmacaes; /* block-cipher based MAC function */
struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
+ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
};
/* per smb session structure/fields */
@@ -181,6 +178,7 @@ enum smb_version {
Smb_20,
Smb_21,
Smb_30,
+ Smb_302,
};
struct mid_q_entry;
@@ -197,6 +195,7 @@ struct cifs_writedata;
struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
+struct cifs_open_parms;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, void *,
@@ -228,6 +227,7 @@ struct smb_version_operations {
void (*dump_detail)(void *);
void (*clear_stats)(struct cifs_tcon *);
void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+ void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
/* verify the message */
int (*check_message)(char *, unsigned int);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -309,9 +309,8 @@ struct smb_version_operations {
const char *, const char *,
struct cifs_sb_info *);
/* open a file for non-posix mounts */
- int (*open)(const unsigned int, struct cifs_tcon *, const char *, int,
- int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *,
- struct cifs_sb_info *);
+ int (*open)(const unsigned int, struct cifs_open_parms *,
+ __u32 *, FILE_ALL_INFO *);
/* set fid protocol-specific info */
void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
/* close a file */
@@ -367,8 +366,13 @@ struct smb_version_operations {
void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *fid);
+ /* The next two functions will need to be changed to per smb session */
+ void (*generate_signingkey)(struct TCP_Server_Info *server);
int (*calc_signature)(struct smb_rqst *rqst,
struct TCP_Server_Info *server);
+ int (*query_mf_symlink)(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid);
};
struct smb_version_values {
@@ -387,6 +391,8 @@ struct smb_version_values {
unsigned int cap_nt_find;
unsigned int cap_large_files;
unsigned int oplock_read;
+ __u16 signing_enabled;
+ __u16 signing_required;
};
#define HEADER_SIZE(server) (server->vals->header_size)
@@ -407,7 +413,8 @@ struct smb_vol {
kgid_t backupgid;
umode_t file_mode;
umode_t dir_mode;
- unsigned secFlg;
+ enum securityEnum sectype; /* sectype requested via mnt opts */
+ bool sign; /* was signing requested via mnt opts? */
bool retry:1;
bool intr:1;
bool setuids:1;
@@ -441,6 +448,7 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
+ bool nosharesock;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -514,6 +522,7 @@ struct TCP_Server_Info {
struct task_struct *tsk;
char server_GUID[16];
__u16 sec_mode;
+ bool sign; /* is signing enabled on this connection? */
bool session_estab; /* mark when very first sess is established */
#ifdef CONFIG_CIFS_SMB2
int echo_credits; /* echo reserved slots */
@@ -521,7 +530,6 @@ struct TCP_Server_Info {
bool echoes:1; /* enable echoes */
#endif
u16 dialect; /* dialect index that server chose */
- enum securityEnum secType;
bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
@@ -540,12 +548,17 @@ struct TCP_Server_Info {
int timeAdj; /* Adjust for difference in server time zone in sec */
__u64 CurrentMid; /* multiplex id - rotating counter */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+ char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
+#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
+#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
+ char negflavor; /* NEGOTIATE response flavor */
/* extended security flavors that server supports */
bool sec_ntlmssp; /* supports NTLMSSP */
bool sec_kerberosu2u; /* supports U2U Kerberos */
@@ -697,7 +710,6 @@ struct cifs_ses {
enum statusEnum status;
unsigned overrideSecFlg; /* if non-zero override global sec flags */
__u16 ipc_tid; /* special tid for connection to IPC share */
- __u16 flags;
__u16 vcnum;
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -714,21 +726,14 @@ struct cifs_ses {
char *password;
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+ enum securityEnum sectype; /* what security flavor was specified? */
+ bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
#ifdef CONFIG_CIFS_SMB2
__u16 session_flags;
#endif /* CONFIG_CIFS_SMB2 */
};
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
- which do not negotiate NTLM or POSIX dialects, but instead
- negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
-
static inline bool
cap_unix(struct cifs_ses *ses)
{
@@ -816,7 +821,7 @@ struct cifs_tcon {
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
- __u32 capabilities;
+ __le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
__u32 vol_serial_number;
@@ -911,6 +916,17 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
+struct cifs_open_parms {
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ int disposition;
+ int desired_access;
+ int create_options;
+ const char *path;
+ struct cifs_fid *fid;
+ bool reconnect:1;
+};
+
struct cifs_fid {
__u16 netfid;
#ifdef CONFIG_CIFS_SMB2
@@ -1348,7 +1364,7 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
@@ -1494,4 +1510,7 @@ extern struct smb_version_values smb21_values;
#define SMB30_VERSION_STRING "3.0"
extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING "3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e996ff6b26d1..11ca24a8e054 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
*/
#define CIFS_SESS_KEY_SIZE (16)
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp {
#define READ_RAW_ENABLE 1
#define WRITE_RAW_ENABLE 2
#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
typedef struct negotiate_rsp {
struct smb_hdr hdr; /* wct = 17 */
__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -553,7 +558,7 @@ typedef struct negotiate_rsp {
/* followed by 16 bytes of server GUID */
/* then security blob if cap_extended_security negotiated */
struct {
- unsigned char GUID[16];
+ unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
unsigned char SecurityBlob[1];
} __attribute__((packed)) extended_response;
} __attribute__((packed)) u;
@@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
/* parms and data follow */
} __attribute__((packed)) NTRANSACT_RSP;
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+ __le64 SourceOffset;
+ __le64 DestinationOffset;
+ __le32 CopyLength;
+ __u32 Reserved;
+} __packed;
+
typedef struct smb_com_transaction_ioctl_req {
struct smb_hdr hdr; /* wct = 23 */
__u8 MaxSetupCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index dda188a94332..b29a012bed33 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
struct cifs_ses *ses,
void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+ enum securityEnum requested);
extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
struct cifs_ses *ses);
extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -430,9 +433,9 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
const struct nls_table *);
extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
-extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
extern int calc_seckey(struct cifs_ses *);
+extern void generate_smb3signingkey(struct TCP_Server_Info *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
extern int calc_lanman_hash(const char *password, const char *cryptkey,
@@ -494,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work);
struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
-
+int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a58dc77cc443..a89c4cb4e6cf 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -367,6 +367,185 @@ vt2_err:
return -EINVAL;
}
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+ int rc = 0;
+ u16 count;
+ char *guid = pSMBr->u.extended_response.GUID;
+ struct TCP_Server_Info *server = ses->server;
+
+ count = get_bcc(&pSMBr->hdr);
+ if (count < SMB1_CLIENT_GUID_SIZE)
+ return -EIO;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->srv_count > 1) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+ cifs_dbg(FYI, "server UID changed\n");
+ memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+ }
+ } else {
+ spin_unlock(&cifs_tcp_ses_lock);
+ memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+ }
+
+ if (count == SMB1_CLIENT_GUID_SIZE) {
+ server->sec_ntlmssp = true;
+ } else {
+ count -= SMB1_CLIENT_GUID_SIZE;
+ rc = decode_negTokenInit(
+ pSMBr->u.extended_response.SecurityBlob, count, server);
+ if (rc != 1)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+ bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+ bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+ bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+ /*
+ * Is signing required by mnt options? If not then check
+ * global_secflags to see if it is there.
+ */
+ if (!mnt_sign_required)
+ mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+ CIFSSEC_MUST_SIGN);
+
+ /*
+ * If signing is required then it's automatically enabled too,
+ * otherwise, check to see if the secflags allow it.
+ */
+ mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+ (global_secflags & CIFSSEC_MAY_SIGN);
+
+ /* If server requires signing, does client allow it? */
+ if (srv_sign_required) {
+ if (!mnt_sign_enabled) {
+ cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+ return -ENOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ /* If client requires signing, does server allow it? */
+ if (mnt_sign_required) {
+ if (!srv_sign_enabled) {
+ cifs_dbg(VFS, "Server does not support signing!");
+ return -ENOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ __s16 tmp;
+ struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+ if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+ return -EOPNOTSUPP;
+
+ server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+ server->maxReq = min_t(unsigned int,
+ le16_to_cpu(rsp->MaxMpxCount),
+ cifs_max_pending);
+ set_credits(server, server->maxReq);
+ server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+ server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
+ /* even though we do not use raw we might as well set this
+ accurately, in case we ever find a need for it */
+ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+ server->max_rw = 0xFF00;
+ server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+ } else {
+ server->max_rw = 0;/* do not need to use raw anyway */
+ server->capabilities = CAP_MPX_MODE;
+ }
+ tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+ if (tmp == -1) {
+ /* OS/2 often does not set timezone therefore
+ * we must use server time to calc time zone.
+ * Could deviate slightly from the right zone.
+ * Smallest defined timezone difference is 15 minutes
+ * (i.e. Nepal). Rounding up/down is done to match
+ * this requirement.
+ */
+ int val, seconds, remain, result;
+ struct timespec ts, utc;
+ utc = CURRENT_TIME;
+ ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+ rsp->SrvTime.Time, 0);
+ cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+ (int)ts.tv_sec, (int)utc.tv_sec,
+ (int)(utc.tv_sec - ts.tv_sec));
+ val = (int)(utc.tv_sec - ts.tv_sec);
+ seconds = abs(val);
+ result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+ remain = seconds % MIN_TZ_ADJ;
+ if (remain >= (MIN_TZ_ADJ / 2))
+ result += MIN_TZ_ADJ;
+ if (val < 0)
+ result = -result;
+ server->timeAdj = result;
+ } else {
+ server->timeAdj = (int)tmp;
+ server->timeAdj *= 60; /* also in seconds */
+ }
+ cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+ /* BB get server time for time conversions and add
+ code to use it and timezone since this is not UTC */
+
+ if (rsp->EncryptionKeyLength ==
+ cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+ memcpy(server->cryptkey, rsp->EncryptionKey,
+ CIFS_CRYPTO_KEY_SIZE);
+ } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+ return -EIO; /* need cryptkey unless plain text */
+ }
+
+ cifs_dbg(FYI, "LANMAN negotiated\n");
+ return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+ return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+ switch (sectype) {
+ case RawNTLMSSP:
+ case Kerberos:
+ return true;
+ case Unspecified:
+ if (global_secflags &
+ (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+ return true;
+ /* Fallthrough */
+ default:
+ return false;
+ }
+}
+
int
CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
{
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
int rc = 0;
int bytes_returned;
int i;
- struct TCP_Server_Info *server;
+ struct TCP_Server_Info *server = ses->server;
u16 count;
- unsigned int secFlags;
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
+
rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
(void **) &pSMB, (void **) &pSMBr);
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
- else /* if override flags set only sign/seal OR them with global auth */
- secFlags = global_secflags | ses->overrideSecFlg;
-
- cifs_dbg(FYI, "secFlags 0x%x\n", secFlags);
-
pSMB->hdr.Mid = get_next_mid(server);
pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
- if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
- cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n");
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
- cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n");
+ if (should_set_ext_sec_flag(ses->sectype)) {
+ cifs_dbg(FYI, "Requesting extended security.");
pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
}
@@ -436,127 +598,21 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
could not negotiate a common dialect */
rc = -EOPNOTSUPP;
goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- } else if ((pSMBr->hdr.WordCount == 13)
- && ((server->dialect == LANMAN_PROT)
- || (server->dialect == LANMAN2_PROT))) {
- __s16 tmp;
- struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
- if ((secFlags & CIFSSEC_MAY_LANMAN) ||
- (secFlags & CIFSSEC_MAY_PLNTXT))
- server->secType = LANMAN;
- else {
- cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n");
- rc = -EOPNOTSUPP;
- goto neg_err_exit;
- }
- server->sec_mode = le16_to_cpu(rsp->SecurityMode);
- server->maxReq = min_t(unsigned int,
- le16_to_cpu(rsp->MaxMpxCount),
- cifs_max_pending);
- set_credits(server, server->maxReq);
- server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
- server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
- /* even though we do not use raw we might as well set this
- accurately, in case we ever find a need for it */
- if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
- server->max_rw = 0xFF00;
- server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
- } else {
- server->max_rw = 0;/* do not need to use raw anyway */
- server->capabilities = CAP_MPX_MODE;
- }
- tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
- if (tmp == -1) {
- /* OS/2 often does not set timezone therefore
- * we must use server time to calc time zone.
- * Could deviate slightly from the right zone.
- * Smallest defined timezone difference is 15 minutes
- * (i.e. Nepal). Rounding up/down is done to match
- * this requirement.
- */
- int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
- ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
- rsp->SrvTime.Time, 0);
- cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
- seconds = abs(val);
- result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
- remain = seconds % MIN_TZ_ADJ;
- if (remain >= (MIN_TZ_ADJ / 2))
- result += MIN_TZ_ADJ;
- if (val < 0)
- result = -result;
- server->timeAdj = result;
- } else {
- server->timeAdj = (int)tmp;
- server->timeAdj *= 60; /* also in seconds */
- }
- cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
-
-
- /* BB get server time for time conversions and add
- code to use it and timezone since this is not UTC */
-
- if (rsp->EncryptionKeyLength ==
- cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
- memcpy(ses->server->cryptkey, rsp->EncryptionKey,
- CIFS_CRYPTO_KEY_SIZE);
- } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
- rc = -EIO; /* need cryptkey unless plain text */
- goto neg_err_exit;
- }
-
- cifs_dbg(FYI, "LANMAN negotiated\n");
- /* we will not end up setting signing flags - as no signing
- was in LANMAN and server did not return the flags on */
- goto signing_check;
-#else /* weak security disabled */
} else if (pSMBr->hdr.WordCount == 13) {
- cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
- rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
- goto neg_err_exit;
+ server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+ rc = decode_lanman_negprot_rsp(server, pSMBr);
+ goto signing_check;
} else if (pSMBr->hdr.WordCount != 17) {
/* unknown wct */
rc = -EOPNOTSUPP;
goto neg_err_exit;
}
- /* else wct == 17 NTLM */
+ /* else wct == 17, NTLM or better */
+
server->sec_mode = pSMBr->SecurityMode;
if ((server->sec_mode & SECMODE_USER) == 0)
cifs_dbg(FYI, "share mode security\n");
- if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
- cifs_dbg(VFS, "Server requests plain text password but client support disabled\n");
-
- if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_NTLM)
- server->secType = NTLM;
- else if (secFlags & CIFSSEC_MAY_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_KRB5)
- server->secType = Kerberos;
- else if (secFlags & CIFSSEC_MAY_NTLMSSP)
- server->secType = RawNTLMSSP;
- else if (secFlags & CIFSSEC_MAY_LANMAN)
- server->secType = LANMAN;
- else {
- rc = -EOPNOTSUPP;
- cifs_dbg(VFS, "Invalid security type\n");
- goto neg_err_exit;
- }
- /* else ... any others ...? */
-
/* one byte, so no need to convert this or EncryptionKeyLen from
little endian */
server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
@@ -569,90 +625,26 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
+
if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
server->capabilities & CAP_EXTENDED_SECURITY) &&
(pSMBr->EncryptionKeyLength == 0)) {
- /* decode security blob */
- count = get_bcc(&pSMBr->hdr);
- if (count < 16) {
- rc = -EIO;
- goto neg_err_exit;
- }
- spin_lock(&cifs_tcp_ses_lock);
- if (server->srv_count > 1) {
- spin_unlock(&cifs_tcp_ses_lock);
- if (memcmp(server->server_GUID,
- pSMBr->u.extended_response.
- GUID, 16) != 0) {
- cifs_dbg(FYI, "server UID changed\n");
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID,
- 16);
- }
- } else {
- spin_unlock(&cifs_tcp_ses_lock);
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID, 16);
- }
-
- if (count == 16) {
- server->secType = RawNTLMSSP;
- } else {
- rc = decode_negTokenInit(pSMBr->u.extended_response.
- SecurityBlob, count - 16,
- server);
- if (rc == 1)
- rc = 0;
- else
- rc = -EINVAL;
- if (server->secType == Kerberos) {
- if (!server->sec_kerberos &&
- !server->sec_mskerberos)
- rc = -EOPNOTSUPP;
- } else if (server->secType == RawNTLMSSP) {
- if (!server->sec_ntlmssp)
- rc = -EOPNOTSUPP;
- } else
- rc = -EOPNOTSUPP;
- }
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+ rc = decode_ext_sec_blob(ses, pSMBr);
} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
rc = -EIO; /* no crypt key only if plain text pwd */
- goto neg_err_exit;
- } else
- server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
- /* MUST_SIGN already includes the MAY_SIGN FLAG
- so if this is zero it means that signing is disabled */
- cifs_dbg(FYI, "Signing disabled\n");
- if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
- cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
- rc = -EOPNOTSUPP;
- }
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
- /* signing required */
- cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags);
- if ((server->sec_mode &
- (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
- cifs_dbg(VFS, "signing required but server lacks support\n");
- rc = -EOPNOTSUPP;
- } else
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
} else {
- /* signing optional ie CIFSSEC_MAY_SIGN */
- if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+ server->capabilities &= ~CAP_EXTENDED_SECURITY;
}
+signing_check:
+ if (!rc)
+ rc = cifs_enable_signing(server, ses->sign);
neg_err_exit:
cifs_buf_release(pSMB);
@@ -777,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
pSMB->hdr.Mid = get_next_mid(ses->server);
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+ if (ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
@@ -1540,8 +1531,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
/* result already set, check signature */
- if (server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
int rc = 0;
rc = cifs_verify_signature(&rqst, server,
@@ -3940,6 +3930,7 @@ QFileInfoRetry:
pSMB->Pad = 0;
pSMB->Fid = netfid;
inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4108,6 +4099,7 @@ UnixQFileInfoRetry:
pSMB->Pad = 0;
pSMB->Fid = netfid;
inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4794,11 +4786,8 @@ getDFSRetry:
strncpy(pSMB->RequestFileName, search_name, name_len);
}
- if (ses->server) {
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- }
+ if (ses->server && ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e3bc39bb9d12..d67c550c4980 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
Opt_acl, Opt_noacl, Opt_locallease,
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
- Opt_multiuser, Opt_sloppy,
+ Opt_multiuser, Opt_sloppy, Opt_nosharesock,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -165,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_mfsymlinks, "mfsymlinks" },
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
+ { Opt_nosharesock, "nosharesock" },
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -275,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
{ Smb_20, SMB20_VERSION_STRING},
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
+ { Smb_302, SMB302_VERSION_STRING },
};
static int ip_connect(struct TCP_Server_Info *server);
@@ -1024,44 +1026,48 @@ static int cifs_parse_security_flavors(char *value,
substring_t args[MAX_OPT_ARGS];
+ /*
+ * With mount options, the last one should win. Reset any existing
+ * settings back to default.
+ */
+ vol->sectype = Unspecified;
+ vol->sign = false;
+
switch (match_token(value, cifs_secflavor_tokens, args)) {
- case Opt_sec_krb5:
- vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
- break;
- case Opt_sec_krb5i:
- vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
- break;
case Opt_sec_krb5p:
- /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
- cifs_dbg(VFS, "Krb5 cifs privacy not supported\n");
- break;
- case Opt_sec_ntlmssp:
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+ cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+ return 1;
+ case Opt_sec_krb5i:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_krb5:
+ vol->sectype = Kerberos;
break;
case Opt_sec_ntlmsspi:
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
- break;
- case Opt_ntlm:
- /* ntlm is default so can be turned off too */
- vol->secFlg |= CIFSSEC_MAY_NTLM;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmssp:
+ vol->sectype = RawNTLMSSP;
break;
case Opt_sec_ntlmi:
- vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
- break;
- case Opt_sec_ntlmv2:
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_ntlm:
+ vol->sectype = NTLM;
break;
case Opt_sec_ntlmv2i:
- vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmv2:
+ vol->sectype = NTLMv2;
break;
#ifdef CONFIG_CIFS_WEAK_PW_HASH
case Opt_sec_lanman:
- vol->secFlg |= CIFSSEC_MAY_LANMAN;
+ vol->sectype = LANMAN;
break;
#endif
case Opt_sec_none:
vol->nullauth = 1;
- vol->secFlg |= CIFSSEC_MAY_NTLM;
break;
default:
cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1119,6 +1125,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->ops = &smb30_operations;
vol->vals = &smb30_values;
break;
+ case Smb_302:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smb302_values;
+ break;
#endif
default:
cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
@@ -1424,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->local_lease = 1;
break;
case Opt_sign:
- vol->secFlg |= CIFSSEC_MUST_SIGN;
+ vol->sign = true;
break;
case Opt_seal:
/* we do not do the following in secFlags because seal
@@ -1455,6 +1465,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_sloppy:
sloppy = true;
break;
+ case Opt_nosharesock:
+ vol->nosharesock = true;
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -1662,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (string == NULL)
goto out_nomem;
- if (strnlen(string, 256) == 256) {
+ if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+ == CIFS_MAX_DOMAINNAME_LEN) {
printk(KERN_WARNING "CIFS: domain name too"
" long\n");
goto cifs_parse_mount_err;
@@ -1978,47 +1992,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
static bool
match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
{
- unsigned int secFlags;
-
- if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = vol->secFlg;
- else
- secFlags = global_secflags | vol->secFlg;
-
- switch (server->secType) {
- case LANMAN:
- if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
- return false;
- break;
- case NTLMv2:
- if (!(secFlags & CIFSSEC_MAY_NTLMV2))
- return false;
- break;
- case NTLM:
- if (!(secFlags & CIFSSEC_MAY_NTLM))
- return false;
- break;
- case Kerberos:
- if (!(secFlags & CIFSSEC_MAY_KRB5))
- return false;
- break;
- case RawNTLMSSP:
- if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
- return false;
- break;
- default:
- /* shouldn't happen */
+ /*
+ * The select_sectype function should either return the vol->sectype
+ * that was specified, or "Unspecified" if that sectype was not
+ * compatible with the given NEGOTIATE request.
+ */
+ if (select_sectype(server, vol->sectype) == Unspecified)
return false;
- }
- /* now check if signing mode is acceptable */
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
- (server->sec_mode & SECMODE_SIGN_REQUIRED))
- return false;
- else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
- (server->sec_mode &
- (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
- return false;
+ /*
+ * Now check if signing mode is acceptable. No need to check
+ * global_secflags at this point since if MUST_SIGN is set then
+ * the server->sign had better be too.
+ */
+ if (vol->sign && !server->sign)
+ return false;
return true;
}
@@ -2027,6 +2015,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
{
struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+ if (vol->nosharesock)
+ return 0;
+
if ((server->vals != vol->vals) || (server->ops != vol->ops))
return 0;
@@ -2118,12 +2109,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
goto out_err;
}
- rc = cifs_crypto_shash_allocate(tcp_ses);
- if (rc) {
- cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
- goto out_err;
- }
-
tcp_ses->ops = volume_info->ops;
tcp_ses->vals = volume_info->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
@@ -2216,7 +2201,11 @@ out_err:
static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
{
- switch (ses->server->secType) {
+ if (vol->sectype != Unspecified &&
+ vol->sectype != ses->sectype)
+ return 0;
+
+ switch (ses->sectype) {
case Kerberos:
if (!uid_eq(vol->cred_uid, ses->cred_uid))
return 0;
@@ -2288,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses)
#ifdef CONFIG_KEYS
-/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
-#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
/* Populate username and pw fields from keyring if possible */
static int
@@ -2493,7 +2482,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
- ses->overrideSecFlg = volume_info->secFlg;
+ ses->sectype = volume_info->sectype;
+ ses->sign = volume_info->sign;
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
@@ -3656,7 +3646,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
NTLMv2 password here) */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
- (ses->server->secType == LANMAN))
+ (ses->sectype == LANMAN))
calc_lanman_hash(tcon->password, ses->server->cryptkey,
ses->server->sec_mode &
SECMODE_PW_ENCRYPT ? true : false,
@@ -3674,8 +3664,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
}
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_STATUS32) {
@@ -3738,7 +3727,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -3827,7 +3816,6 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
int rc = -ENOSYS;
struct TCP_Server_Info *server = ses->server;
- ses->flags = 0;
ses->capabilities = server->capabilities;
if (linuxExtEnabled == 0)
ses->capabilities &= (~server->vals->cap_unix);
@@ -3848,6 +3836,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
server->sequence_number = 0x2;
server->session_estab = true;
ses->auth_key.response = NULL;
+ if (server->ops->generate_signingkey)
+ server->ops->generate_signingkey(server);
}
mutex_unlock(&server->srv_mutex);
@@ -3870,23 +3860,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
static int
cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
{
- switch (ses->server->secType) {
- case Kerberos:
- vol->secFlg = CIFSSEC_MUST_KRB5;
+ vol->sectype = ses->sectype;
+
+ /* krb5 is special, since we don't need username or pw */
+ if (vol->sectype == Kerberos)
return 0;
- case NTLMv2:
- vol->secFlg = CIFSSEC_MUST_NTLMV2;
- break;
- case NTLM:
- vol->secFlg = CIFSSEC_MUST_NTLM;
- break;
- case RawNTLMSSP:
- vol->secFlg = CIFSSEC_MUST_NTLMSSP;
- break;
- case LANMAN:
- vol->secFlg = CIFSSEC_MUST_LANMAN;
- break;
- }
return cifs_set_cifscreds(vol, ses);
}
@@ -3912,6 +3890,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->sectype = master_tcon->ses->sectype;
+ vol_info->sign = master_tcon->ses->sign;
rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5175aebf6737..d62ce0d48141 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,6 +204,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
struct inode *newinode = NULL;
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
*oplock = 0;
if (tcon->ses->server->oplocks)
@@ -319,9 +320,16 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, fid, oplock,
- buf, cifs_sb);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
goto out;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0630710a9c3f..7e36ae34e947 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -183,6 +183,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
int create_options = CREATE_NOT_DIR;
FILE_ALL_INFO *buf;
struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
if (!server->ops->open)
return -ENOSYS;
@@ -224,9 +225,16 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, fid, oplock, buf,
- cifs_sb);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc)
goto out;
@@ -553,11 +561,10 @@ cifs_relock_file(struct cifsFileInfo *cfile)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
- /* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ down_read(&cinode->lock_sem);
if (cinode->can_cache_brlcks) {
- /* can cache locks - no need to push them */
- up_write(&cinode->lock_sem);
+ /* can cache locks - no need to relock */
+ up_read(&cinode->lock_sem);
return rc;
}
@@ -568,7 +575,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
else
rc = tcon->ses->server->ops->push_mand_locks(cfile);
- up_write(&cinode->lock_sem);
+ up_read(&cinode->lock_sem);
return rc;
}
@@ -587,7 +594,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
int desired_access;
int disposition = FILE_OPEN;
int create_options = CREATE_NOT_DIR;
- struct cifs_fid fid;
+ struct cifs_open_parms oparms;
xid = get_xid();
mutex_lock(&cfile->fh_mutex);
@@ -637,9 +644,10 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
rc = cifs_posix_open(full_path, NULL, inode->i_sb,
cifs_sb->mnt_file_mode /* ignored */,
- oflags, &oplock, &fid.netfid, xid);
+ oflags, &oplock, &cfile->fid.netfid, xid);
if (rc == 0) {
cifs_dbg(FYI, "posix reopen succeeded\n");
+ oparms.reconnect = true;
goto reopen_success;
}
/*
@@ -654,7 +662,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
create_options |= CREATE_OPEN_BACKUP_INTENT;
if (server->ops->get_lease_key)
- server->ops->get_lease_key(inode, &fid);
+ server->ops->get_lease_key(inode, &cfile->fid);
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = &cfile->fid;
+ oparms.reconnect = true;
/*
* Can not refresh inode by passing in file_info buf to be returned by
@@ -663,9 +680,14 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
* version of file size can be stale. If we knew for sure that inode was
* not dirty locally we could do this.
*/
- rc = server->ops->open(xid, tcon, full_path, disposition,
- desired_access, create_options, &fid, &oplock,
- NULL, cifs_sb);
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ if (rc == -ENOENT && oparms.reconnect == false) {
+ /* durable handle timeout is expired - open the file again */
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ /* indicate that we need to relock the file */
+ oparms.reconnect = true;
+ }
+
if (rc) {
mutex_unlock(&cfile->fh_mutex);
cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
@@ -696,8 +718,9 @@ reopen_success:
* to the server to get the new inode info.
*/
- server->ops->set_fid(cfile, &fid, oplock);
- cifs_relock_file(cfile);
+ server->ops->set_fid(cfile, &cfile->fid, oplock);
+ if (oparms.reconnect)
+ cifs_relock_file(cfile);
reopen_error_exit:
kfree(full_path);
@@ -3547,11 +3570,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return cifs_fscache_release_page(page, gfp);
}
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20efd81266c6..449b6cf09b09 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -558,6 +558,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_mode &= ~(S_IWUGO);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ if (fattr->cf_nlink < 1) {
+ cifs_dbg(1, "replacing bogus file nlink value %u\n",
+ fattr->cf_nlink);
+ fattr->cf_nlink = 1;
+ }
}
fattr->cf_uid = cifs_sb->mnt_uid;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b83c3f5646bd..562044f700e5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
}
int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, unsigned int xid)
+open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid)
{
int rc;
int oplock = 0;
__u16 netfid = 0;
struct tcon_link *tlink;
- struct cifs_tcon *pTcon;
+ struct cifs_tcon *ptcon;
struct cifs_io_parms io_parms;
- u8 *buf;
- char *pbuf;
- unsigned int bytes_read = 0;
int buf_type = CIFS_NO_BUFFER;
- unsigned int link_len = 0;
FILE_ALL_INFO file_info;
- if (!CIFSCouldBeMFSymlink(fattr))
- /* it's not a symlink */
- return 0;
-
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ ptcon = tlink_tcon(tlink);
- rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+ rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ,
CREATE_NOT_DIR, &netfid, &oplock, &file_info,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != 0)
- goto out;
+ if (rc != 0) {
+ cifs_put_tlink(tlink);
+ return rc;
+ }
if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
- CIFSSMBClose(xid, pTcon, netfid);
+ CIFSSMBClose(xid, ptcon, netfid);
+ cifs_put_tlink(tlink);
/* it's not a symlink */
- goto out;
+ return rc;
}
- buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf) {
- rc = -ENOMEM;
- goto out;
- }
- pbuf = buf;
io_parms.netfid = netfid;
io_parms.pid = current->tgid;
- io_parms.tcon = pTcon;
+ io_parms.tcon = ptcon;
io_parms.offset = 0;
io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, pTcon, netfid);
- if (rc != 0) {
- kfree(buf);
+ rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+ CIFSSMBClose(xid, ptcon, netfid);
+ cifs_put_tlink(tlink);
+ return rc;
+}
+
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+ const unsigned char *path,
+ struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+ int rc = 0;
+ u8 *buf = NULL;
+ unsigned int link_len = 0;
+ unsigned int bytes_read = 0;
+ struct cifs_tcon *ptcon;
+
+ if (!CIFSCouldBeMFSymlink(fattr))
+ /* it's not a symlink */
+ return 0;
+
+ buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
goto out;
}
+ ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+ if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
+ rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
+ &bytes_read, cifs_sb, xid);
+ else
+ goto out;
+
+ if (rc != 0)
+ goto out;
+
+ if (bytes_read == 0) /* not a symlink */
+ goto out;
+
rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
- kfree(buf);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
fattr->cf_dtype = DT_LNK;
out:
- cifs_put_tlink(tlink);
+ kfree(buf);
return rc;
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1bec014779fd..f7d4b2285efe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
if (treeCon->nocase)
buffer->Flags |= SMBFLG_CASELESS;
if ((treeCon->ses) && (treeCon->ses->server))
- if (treeCon->ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (treeCon->ses->server->sign)
buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f1213799de1a..69d2c826a23b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
return;
}
+ /*
+ * If we know that the inode will need to be revalidated immediately,
+ * then don't create a new dentry for it. We'll end up doing an on
+ * the wire call either way and this spares us an invalidation.
+ */
+ if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+ return;
+
dentry = d_alloc(parent, name);
if (!dentry)
return;
@@ -126,6 +134,22 @@ out:
dput(dentry);
}
+/*
+ * Is it possible that this directory might turn out to be a DFS referral
+ * once we go to try and use it?
+ */
+static bool
+cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+ return true;
+#endif
+ return false;
+}
+
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -135,6 +159,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
+ /*
+ * Windows CIFS servers generally make DFS referrals look
+ * like directories in FIND_* responses with the reparse
+ * attribute flag also set (since DFS junctions are
+ * reparse points). We must revalidate at least these
+ * directory inodes before trying to use them (if
+ * they are DFS we will get PATH_NOT_COVERED back
+ * when queried directly and can then try to connect
+ * to the DFS target)
+ */
+ if (cifs_dfs_is_possible(cifs_sb) &&
+ (fattr->cf_cifsattrs & ATTR_REPARSE))
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index f230571a7ab3..08dd37bb23aa 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_UNICODE) {
@@ -198,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
bytes_ret = 0;
} else
bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ CIFS_MAX_DOMAINNAME_LEN, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -256,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* copy domain */
if (ses->domainName != NULL) {
- strncpy(bcc_ptr, ses->domainName, 256);
- bcc_ptr += strnlen(ses->domainName, 256);
+ strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
} /* else we will send a null domain name
so the server will default to its own domain */
*bcc_ptr = 0;
@@ -310,11 +309,10 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
}
-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
- struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+ struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
{
- int rc = 0;
int len;
char *bcc_ptr = *pbcc_area;
@@ -322,24 +320,22 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverOS);
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
if (ses->serverOS)
strncpy(ses->serverOS, bcc_ptr, len);
- if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
+ if (strncmp(ses->serverOS, "OS/2", 4) == 0)
cifs_dbg(FYI, "OS/2 server\n");
- ses->flags |= CIFS_SES_OS2;
- }
bcc_ptr += len + 1;
bleft -= len + 1;
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverNOS);
@@ -352,7 +348,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
len = strnlen(bcc_ptr, bleft);
if (len > bleft)
- return rc;
+ return;
/* No domain field in LANMAN case. Domain is
returned by old servers in the SMB negprot response */
@@ -360,8 +356,6 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
but thus do return domain here we could add parsing
for it later, but it is not very important */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
-
- return rc;
}
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -432,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
if (!ses->server->session_estab)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -471,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
if (!ses->server->session_estab)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -558,6 +550,56 @@ setup_ntlmv2_ret:
return rc;
}
+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+ switch (server->negflavor) {
+ case CIFS_NEGFLAVOR_EXTENDED:
+ switch (requested) {
+ case Kerberos:
+ case RawNTLMSSP:
+ return requested;
+ case Unspecified:
+ if (server->sec_ntlmssp &&
+ (global_secflags & CIFSSEC_MAY_NTLMSSP))
+ return RawNTLMSSP;
+ if ((server->sec_kerberos || server->sec_mskerberos) &&
+ (global_secflags & CIFSSEC_MAY_KRB5))
+ return Kerberos;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ case CIFS_NEGFLAVOR_UNENCAP:
+ switch (requested) {
+ case NTLM:
+ case NTLMv2:
+ return requested;
+ case Unspecified:
+ if (global_secflags & CIFSSEC_MAY_NTLMV2)
+ return NTLMv2;
+ if (global_secflags & CIFSSEC_MAY_NTLM)
+ return NTLM;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ case CIFS_NEGFLAVOR_LANMAN:
+ switch (requested) {
+ case LANMAN:
+ return requested;
+ case Unspecified:
+ if (global_secflags & CIFSSEC_MAY_LANMAN)
+ return LANMAN;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ default:
+ return Unspecified;
+ }
+}
+
int
CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
@@ -579,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
u16 blob_len;
char *ntlmsspblob = NULL;
- if (ses == NULL)
+ if (ses == NULL) {
+ WARN(1, "%s: ses == NULL!", __func__);
return -EINVAL;
+ }
- type = ses->server->secType;
+ type = select_sectype(ses->server, ses->sectype);
cifs_dbg(FYI, "sess setup type %d\n", type);
+ if (type == Unspecified) {
+ cifs_dbg(VFS, "Unable to select appropriate authentication method!");
+ return -EINVAL;
+ }
+
if (type == RawNTLMSSP) {
/* if memory allocation is successful, caller of this function
* frees it.
@@ -643,8 +692,6 @@ ssetup_ntlmssp_authenticate:
}
bcc_ptr = str_area;
- ses->flags &= ~CIFS_SES_LANMAN;
-
iov[1].iov_base = NULL;
iov[1].iov_len = 0;
@@ -668,7 +715,6 @@ ssetup_ntlmssp_authenticate:
ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
true : false, lnm_session_key);
- ses->flags |= CIFS_SES_LANMAN;
memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
@@ -938,8 +984,7 @@ ssetup_ntlmssp_authenticate:
}
decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
} else {
- rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
- ses, nls_cp);
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
}
ssetup_exit:
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3efdb9d5c0b8..60943978aec3 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -449,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
* WRITEX header, not including the 4 byte RFC1001 length.
*/
if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
- (!(server->capabilities & CAP_UNIX) &&
- (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+ (!(server->capabilities & CAP_UNIX) && server->sign))
wsize = min_t(unsigned int, wsize,
server->maxBuf - sizeof(WRITE_REQ) + 4);
@@ -675,20 +674,23 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
}
static int
-cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
- int disposition, int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
- struct cifs_sb_info *cifs_sb)
-{
- if (!(tcon->ses->capabilities & CAP_NT_SMBS))
- return SMBLegacyOpen(xid, tcon, path, disposition,
- desired_access, create_options,
- &fid->netfid, oplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
+{
+ if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+ return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition,
+ oparms->desired_access,
+ oparms->create_options,
+ &oparms->fid->netfid, oplock, buf,
+ oparms->cifs_sb->local_nls,
+ oparms->cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
- return CIFSSMBOpen(xid, tcon, path, disposition, desired_access,
- create_options, &fid->netfid, oplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition, oparms->desired_access,
+ oparms->create_options, &oparms->fid->netfid, oplock,
+ buf, oparms->cifs_sb->local_nls,
+ oparms->cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
}
@@ -765,20 +767,14 @@ smb_set_file_info(struct inode *inode, const char *full_path,
}
tcon = tlink_tcon(tlink);
- /*
- * NT4 apparently returns success on this call, but it doesn't really
- * work.
- */
- if (!(tcon->ses->flags & CIFS_SES_NT4)) {
- rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
- cifs_sb->local_nls,
+ rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
- goto out;
- } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
- goto out;
+ if (rc == 0) {
+ cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+ goto out;
+ } else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
+ goto out;
}
cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
@@ -948,6 +944,7 @@ struct smb_version_operations smb1_operations = {
.mand_lock = cifs_mand_lock,
.mand_unlock_range = cifs_unlock_range,
.push_mand_locks = cifs_push_mandatory_locks,
+ .query_mf_symlink = open_query_close_cifs_symlink,
};
struct smb_version_values smb1_values = {
@@ -964,4 +961,6 @@ struct smb_version_values smb1_values = {
.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
.cap_large_files = CAP_LARGE_FILES,
.oplock_read = OPLOCK_READ,
+ .signing_enabled = SECMODE_SIGN_ENABLED,
+ .signing_required = SECMODE_SIGN_REQUIRED,
};
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 5da1b55a2258..04a81a4142c3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -40,7 +40,8 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
oplock &= 0xFF;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
- if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+ oplock == SMB2_OPLOCK_LEVEL_BATCH) {
cinode->clientCanCacheAll = true;
cinode->clientCanCacheRead = true;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
@@ -57,17 +58,16 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
}
int
-smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
- int disposition, int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
- struct cifs_sb_info *cifs_sb)
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
{
int rc;
__le16 *smb2_path;
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
+ struct cifs_fid *fid = oparms->fid;
- smb2_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
rc = -ENOMEM;
goto out;
@@ -80,21 +80,19 @@ smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
goto out;
}
- desired_access |= FILE_READ_ATTRIBUTES;
- *smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
- if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+ if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
- rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid,
- &fid->volatile_fid, desired_access, disposition,
- 0, 0, smb2_oplock, smb2_data);
+ rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
if (rc)
goto out;
if (buf) {
/* open response does not have IndexNumber field - get it */
- rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid,
+ rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid,
&smb2_data->IndexNumber);
if (rc) {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..c38350851b08 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
#define SMB2_SIGNATURE_SIZE (16)
#define SMB2_NTLMV2_SESSKEY_SIZE (16)
#define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fff6dfba6204..c6ec1633309a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -41,21 +41,26 @@ static int
smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition,
- __u32 file_attributes, __u32 create_options,
- void *data, int command)
+ __u32 create_options, void *data, int command)
{
int rc, tmprc = 0;
- u64 persistent_fid, volatile_fid;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- desired_access, create_disposition, file_attributes,
- create_options, &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
if (rc) {
kfree(utf16_path);
return rc;
@@ -65,8 +70,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
case SMB2_OP_DELETE:
break;
case SMB2_OP_QUERY_INFO:
- tmprc = SMB2_query_info(xid, tcon, persistent_fid,
- volatile_fid,
+ tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
(struct smb2_file_all_info *)data);
break;
case SMB2_OP_MKDIR:
@@ -76,19 +81,21 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
*/
break;
case SMB2_OP_RENAME:
- tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid,
- (__le16 *)data);
+ tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
break;
case SMB2_OP_HARDLINK:
- tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid,
- volatile_fid, (__le16 *)data);
+ tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
break;
case SMB2_OP_SET_EOF:
- tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, (__le64 *)data);
+ tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, current->tgid,
+ (__le64 *)data);
break;
case SMB2_OP_SET_INFO:
- tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid,
+ tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
(FILE_BASIC_INFO *)data);
break;
default:
@@ -96,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
break;
}
- rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (tmprc)
rc = tmprc;
kfree(utf16_path);
@@ -129,8 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return -ENOMEM;
rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0,
- smb2_data, SMB2_OP_QUERY_INFO);
+ FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
+ SMB2_OP_QUERY_INFO);
if (rc)
goto out;
@@ -145,7 +152,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
}
@@ -164,7 +171,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
@@ -175,7 +182,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+ CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
NULL, SMB2_OP_DELETE);
}
@@ -184,7 +191,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- 0, CREATE_DELETE_ON_CLOSE, NULL,
+ CREATE_DELETE_ON_CLOSE, NULL,
SMB2_OP_DELETE);
}
@@ -203,7 +210,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, 0, smb2_to_name, command);
+ FILE_OPEN, 0, smb2_to_name, command);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -234,7 +241,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
{
__le64 eof = cpu_to_le64(size);
return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof,
+ FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
SMB2_OP_SET_EOF);
}
@@ -250,7 +257,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf,
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
SMB2_OP_SET_INFO);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 10383d8c015b..b0c43345cd98 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -266,6 +266,10 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
break;
case SMB2_IOCTL:
+ *off = le32_to_cpu(
+ ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+ *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+ break;
case SMB2_CHANGE_NOTIFY:
default:
/* BB FIXME for unimplemented cases above */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f2e76f3b0c61..f259e6cc8357 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -213,22 +213,29 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
{
int rc;
- __u64 persistent_fid, volatile_fid;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
if (rc) {
kfree(utf16_path);
return rc;
}
- rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
kfree(utf16_path);
return rc;
}
@@ -281,6 +288,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
}
static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+ seq_puts(m, "\n\tShare Capabilities:");
+ if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+ seq_puts(m, " DFS,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ seq_puts(m, " CONTINUOUS AVAILABILITY,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+ seq_puts(m, " SCALEOUT,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+ seq_puts(m, " CLUSTER,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+ seq_puts(m, " ASYMMETRIC,");
+ if (tcon->capabilities == 0)
+ seq_puts(m, " None");
+ seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+}
+
+static void
smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
{
#ifdef CONFIG_CIFS_STATS
@@ -292,7 +318,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, "\nSessionSetups: %d sent %d failed",
atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
-#define SMB2LOGOFF 0x0002 /* trivial request/resp */
seq_printf(m, "\nLogoffs: %d sent %d failed",
atomic_read(&sent[SMB2_LOGOFF_HE]),
atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -425,15 +450,20 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
__le16 *utf16_path;
int rc;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- __u64 persistent_fid, volatile_fid;
+ struct cifs_open_parms oparms;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
- rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0,
- &oplock, NULL);
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
kfree(utf16_path);
if (rc) {
cifs_dbg(VFS, "open dir failed\n");
@@ -442,14 +472,12 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
srch_inf->entries_in_buffer = 0;
srch_inf->index_of_last_entry = 0;
- fid->persistent_fid = persistent_fid;
- fid->volatile_fid = volatile_fid;
- rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
- srch_inf);
+ rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+ fid->volatile_fid, 0, srch_inf);
if (rc) {
cifs_dbg(VFS, "query directory failed\n");
- SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
return rc;
}
@@ -510,17 +538,25 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *buf)
{
int rc;
- u64 persistent_fid, volatile_fid;
__le16 srch_path = 0; /* Null - open root of share */
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
- rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
if (rc)
return rc;
buf->f_type = SMB2_MAGIC_NUMBER;
- rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf);
- SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+ rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ buf);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
return rc;
}
@@ -645,6 +681,7 @@ struct smb_version_operations smb30_operations = {
.dump_detail = smb2_dump_detail,
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
+ .dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
@@ -690,6 +727,7 @@ struct smb_version_operations smb30_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
+ .generate_signingkey = generate_smb3signingkey,
.calc_signature = smb3_calc_signature,
};
@@ -709,6 +747,8 @@ struct smb_version_values smb20_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
struct smb_version_values smb21_values = {
@@ -727,6 +767,8 @@ struct smb_version_values smb21_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
struct smb_version_values smb30_values = {
@@ -745,4 +787,26 @@ struct smb_version_values smb30_values = {
.cap_nt_find = SMB2_NT_FIND,
.cap_large_files = SMB2_LARGE_FILES,
.oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+};
+
+struct smb_version_values smb302_values = {
+ .version_string = SMB302_VERSION_STRING,
+ .protocol_id = SMB302_PROT_ID,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .oplock_read = SMB2_OPLOCK_LEVEL_II,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
};
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b95ce2b54e8..abc9c2809b51 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/smb2pdu.c
*
- * Copyright (C) International Business Machines Corp., 2009, 2012
+ * Copyright (C) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
if (!tcon)
goto out;
+ /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+ if ((tcon->ses) &&
+ (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ hdr->CreditCharge = cpu_to_le16(1);
+ /* else CreditCharge MBZ */
+
hdr->TreeId = tcon->tid;
/* Uid is not converted */
if (tcon->ses)
hdr->SessionId = tcon->ses->Suid;
- /* BB check following DFS flags BB */
- /* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */
- if (tcon->share_flags & SHI1005_FLAGS_DFS)
- hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS;
- /* BB how does SMB2 do case sensitive? */
- /* if (tcon->nocase)
- hdr->Flags |= SMBFLG_CASELESS; */
- if (tcon->ses && tcon->ses->server &&
- (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
+
+ /*
+ * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+ * to pass the path on the Open SMB prefixed by \\server\share.
+ * Not sure when we would need to do the augmented path (if ever) and
+ * setting this flag breaks the SMB2 open operation since it is
+ * illegal to send an empty path name (without \\server\share prefix)
+ * when the DFS flag is set in the SMB open header. We could
+ * consider setting the flag on all operations other than open
+ * but it is safer to net set it for now.
+ */
+/* if (tcon->share_flags & SHI1005_FLAGS_DFS)
+ hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+ if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
hdr->Flags |= SMB2_FLAGS_SIGNED;
out:
pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
struct kvec iov[1];
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server;
- unsigned int sec_flags;
- u16 temp = 0;
+ struct TCP_Server_Info *server = ses->server;
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
cifs_dbg(FYI, "Negotiate protocol\n");
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
- else /* if override flags set only sign/seal OR them with global auth */
- sec_flags = global_secflags | ses->overrideSecFlg;
-
- cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
req->hdr.SessionId = 0;
req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
inc_rfc1001_len(req, 2);
/* only one of SMB2 signing flags may be set in SMB2 request */
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
- temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
- req->SecurityMode = cpu_to_le16(temp);
+ if (ses->sign)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+ else if (global_secflags & CIFSSEC_MAY_SIGN)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+ else
+ req->SecurityMode = 0;
req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
@@ -399,6 +401,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+ else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
else {
cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
le16_to_cpu(rsp->DialectRevision));
@@ -407,6 +411,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
}
server->dialect = le16_to_cpu(rsp->DialectRevision);
+ /* SMB2 only has an extended negflavor */
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
server->max_read = le32_to_cpu(rsp->MaxReadSize);
server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,44 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
&rsp->hdr);
- if (blob_length == 0) {
- cifs_dbg(VFS, "missing security blob on negprot\n");
- rc = -EIO;
- goto neg_exit;
- }
-
- cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
- cifs_dbg(FYI, "Signing required\n");
- if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
- SMB2_NEGOTIATE_SIGNING_ENABLED))) {
- cifs_dbg(VFS, "signing required but server lacks support\n");
- rc = -EOPNOTSUPP;
- goto neg_exit;
- }
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
- } else if (sec_flags & CIFSSEC_MAY_SIGN) {
- cifs_dbg(FYI, "Signing optional\n");
- if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
- cifs_dbg(FYI, "Server requires signing\n");
- server->sec_mode |= SECMODE_SIGN_REQUIRED;
- } else {
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- }
- } else {
- cifs_dbg(FYI, "Signing disabled\n");
- if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
- cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
- rc = -EOPNOTSUPP;
- goto neg_exit;
- }
- server->sec_mode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- }
+ /*
+ * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+ * for us will be
+ * ses->sectype = RawNTLMSSP;
+ * but for time being this is our only auth choice so doesn't matter.
+ * We just found a server which sets blob length to zero expecting raw.
+ */
+ if (blob_length == 0)
+ cifs_dbg(FYI, "missing security blob on negprot\n");
+ rc = cifs_enable_signing(server, ses->sign);
#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
- rc = decode_neg_token_init(security_blob, blob_length,
+ if (rc)
+ goto neg_exit;
+ if (blob_length)
+ rc = decode_neg_token_init(security_blob, blob_length,
&server->sec_type);
if (rc == 1)
rc = 0;
@@ -480,9 +464,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
int rc = 0;
int resp_buftype;
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
- struct TCP_Server_Info *server;
- unsigned int sec_flags;
- u8 temp = 0;
+ struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
char *security_blob;
char *ntlmssp_blob = NULL;
@@ -490,11 +472,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "Session Setup\n");
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
/*
@@ -505,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
if (!ses->ntlmssp)
return -ENOMEM;
- ses->server->secType = RawNTLMSSP;
+ /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+ ses->sectype = RawNTLMSSP;
ssetup_ntlmssp_authenticate:
if (phase == NtLmChallenge)
@@ -515,28 +496,19 @@ ssetup_ntlmssp_authenticate:
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
- else /* if override flags set only sign/seal OR them with global auth */
- sec_flags = global_secflags | ses->overrideSecFlg;
-
- cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
req->hdr.SessionId = 0; /* First session, not a reauthenticate */
req->VcNumber = 0; /* MBZ */
/* to enable echos and oplocks */
req->hdr.CreditRequest = cpu_to_le16(3);
/* only one of SMB2 signing flags may be set in SMB2 request */
- if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED)
- temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
- else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
- temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
- req->SecurityMode = temp;
+ if (server->sign)
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+ else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+ else
+ req->SecurityMode = 0;
+
req->Capabilities = 0;
req->Channel = 0; /* MBZ */
@@ -679,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
/* since no tcon, smb2_init can not do this, so do here */
req->hdr.SessionId = ses->Suid;
- if (server->sec_mode & SECMODE_SIGN_REQUIRED)
+ if (server->sign)
req->hdr.Flags |= SMB2_FLAGS_SIGNED;
rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -788,11 +760,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
}
tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+ tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
tcon->tid = rsp->hdr.TreeId;
- strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -874,29 +847,76 @@ create_lease_buf(u8 *lease_key, u8 oplock)
return buf;
}
+static struct create_durable *
+create_durable_buf(void)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+ buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
static __u8
parse_lease_state(struct smb2_create_rsp *rsp)
{
char *data_offset;
struct create_lease *lc;
bool found = false;
+ unsigned int next = 0;
+ char *name;
- data_offset = (char *)rsp;
- data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
lc = (struct create_lease *)data_offset;
do {
- char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
+ lc = (struct create_lease *)((char *)lc + next);
+ name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
strncmp(name, "RqLs", 4)) {
- lc = (struct create_lease *)((char *)lc
- + le32_to_cpu(lc->ccontext.Next));
+ next = le32_to_cpu(lc->ccontext.Next);
continue;
}
if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
found = true;
break;
- } while (le32_to_cpu(lc->ccontext.Next) != 0);
+ } while (next != 0);
if (!found)
return 0;
@@ -904,23 +924,74 @@ parse_lease_state(struct smb2_create_rsp *rsp)
return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
}
+static int
+add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_lease);
+ req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) - 4 +
+ iov[num - 1].iov_len);
+ req->CreateContextsLength = cpu_to_le32(
+ le32_to_cpu(req->CreateContextsLength) +
+ sizeof(struct create_lease));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (oparms->reconnect) {
+ iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+ } else
+ iov[num].iov_base = create_durable_buf();
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ req->CreateContextsLength =
+ cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
+ sizeof(struct create_durable));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+ *num_iovec = num + 1;
+ return 0;
+}
+
int
-SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
- u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
- __u32 create_disposition, __u32 file_attributes, __u32 create_options,
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u8 *oplock, struct smb2_file_all_info *buf)
{
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
struct TCP_Server_Info *server;
+ struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct kvec iov[3];
+ struct kvec iov[4];
int resp_buftype;
int uni_path_len;
__le16 *copy_path = NULL;
int copy_size;
int rc = 0;
- int num_iovecs = 2;
+ unsigned int num_iovecs = 2;
+ __u32 file_attributes = 0;
cifs_dbg(FYI, "create/open\n");
@@ -933,55 +1004,47 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
if (rc)
return rc;
+ if (oparms->create_options & CREATE_OPTION_READONLY)
+ file_attributes |= ATTR_READONLY;
+
req->ImpersonationLevel = IL_IMPERSONATION;
- req->DesiredAccess = cpu_to_le32(desired_access);
+ req->DesiredAccess = cpu_to_le32(oparms->desired_access);
/* File attributes ignored on open (used in create though) */
req->FileAttributes = cpu_to_le32(file_attributes);
req->ShareAccess = FILE_SHARE_ALL_LE;
- req->CreateDisposition = cpu_to_le32(create_disposition);
- req->CreateOptions = cpu_to_le32(create_options);
+ req->CreateDisposition = cpu_to_le32(oparms->disposition);
+ req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
- req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)
- - 8 /* pad */ - 4 /* do not count rfc1001 len field */);
+ /* do not count rfc1001 len field */
+ req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field */
iov[0].iov_len = get_rfc1002_length(req) + 4;
/* MUST set path len (NameLength) to 0 opening root of share */
- if (uni_path_len >= 4) {
- req->NameLength = cpu_to_le16(uni_path_len - 2);
- /* -1 since last byte is buf[0] which is sent below (path) */
- iov[0].iov_len--;
- if (uni_path_len % 8 != 0) {
- copy_size = uni_path_len / 8 * 8;
- if (copy_size < uni_path_len)
- copy_size += 8;
-
- copy_path = kzalloc(copy_size, GFP_KERNEL);
- if (!copy_path)
- return -ENOMEM;
- memcpy((char *)copy_path, (const char *)path,
- uni_path_len);
- uni_path_len = copy_size;
- path = copy_path;
- }
-
- iov[1].iov_len = uni_path_len;
- iov[1].iov_base = path;
- /*
- * -1 since last byte is buf[0] which was counted in
- * smb2_buf_len.
- */
- inc_rfc1001_len(req, uni_path_len - 1);
- } else {
- iov[0].iov_len += 7;
- req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
- req->hdr.smb2_buf_length) + 8 - 1);
- num_iovecs = 1;
- req->NameLength = 0;
+ req->NameLength = cpu_to_le16(uni_path_len - 2);
+ /* -1 since last byte is buf[0] which is sent below (path) */
+ iov[0].iov_len--;
+ if (uni_path_len % 8 != 0) {
+ copy_size = uni_path_len / 8 * 8;
+ if (copy_size < uni_path_len)
+ copy_size += 8;
+
+ copy_path = kzalloc(copy_size, GFP_KERNEL);
+ if (!copy_path)
+ return -ENOMEM;
+ memcpy((char *)copy_path, (const char *)path,
+ uni_path_len);
+ uni_path_len = copy_size;
+ path = copy_path;
}
+ iov[1].iov_len = uni_path_len;
+ iov[1].iov_base = path;
+ /* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+ inc_rfc1001_len(req, uni_path_len - 1);
+
if (!server->oplocks)
*oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -989,21 +1052,29 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
*oplock == SMB2_OPLOCK_LEVEL_NONE)
req->RequestedOplockLevel = *oplock;
else {
- iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock);
- if (iov[num_iovecs].iov_base == NULL) {
+ rc = add_lease_context(iov, &num_iovecs, oplock);
+ if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
- return -ENOMEM;
+ return rc;
+ }
+ }
+
+ if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+ /* need to set Next field of lease context if we request it */
+ if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[num_iovecs-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(sizeof(struct create_lease));
+ }
+ rc = add_durable_context(iov, &num_iovecs, oparms);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ kfree(iov[num_iovecs-1].iov_base);
+ return rc;
}
- iov[num_iovecs].iov_len = sizeof(struct create_lease);
- req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) - 4 - 8 +
- iov[num_iovecs-1].iov_len);
- req->CreateContextsLength = cpu_to_le32(
- sizeof(struct create_lease));
- inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
- num_iovecs++;
}
rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -1014,8 +1085,8 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
goto creat_exit;
}
- *persistent_fid = rsp->PersistentFileId;
- *volatile_fid = rsp->VolatileFileId;
+ oparms->fid->persistent_fid = rsp->PersistentFileId;
+ oparms->fid->volatile_fid = rsp->VolatileFileId;
if (buf) {
memcpy(buf, &rsp->CreationTime, 32);
@@ -1036,6 +1107,122 @@ creat_exit:
return rc;
}
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+ u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb2_ioctl_req *req;
+ struct smb2_ioctl_rsp *rsp;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[2];
+ int resp_buftype;
+ int num_iovecs;
+ int rc = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->CtlCode = cpu_to_le32(opcode);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ if (indatalen) {
+ req->InputCount = cpu_to_le32(indatalen);
+ /* do not set InputOffset if no input data */
+ req->InputOffset =
+ cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+ iov[1].iov_base = in_data;
+ iov[1].iov_len = indatalen;
+ num_iovecs = 2;
+ } else
+ num_iovecs = 1;
+
+ req->OutputOffset = 0;
+ req->OutputCount = 0; /* MBZ */
+
+ /*
+ * Could increase MaxOutputResponse, but that would require more
+ * than one credit. Windows typically sets this smaller, but for some
+ * ioctls it may be useful to allow server to send more. No point
+ * limiting what the server can send as long as fits in one credit
+ */
+ req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+ if (is_fsctl)
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+ else
+ req->Flags = 0;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ if (indatalen)
+ inc_rfc1001_len(req, indatalen);
+
+ rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+ rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+ if (rc != 0) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ goto ioctl_exit;
+ }
+
+ /* check if caller wants to look at return data or just return rc */
+ if ((plen == NULL) || (out_data == NULL))
+ goto ioctl_exit;
+
+ *plen = le32_to_cpu(rsp->OutputCount);
+
+ /* We check for obvious errors in the output buffer length and offset */
+ if (*plen == 0)
+ goto ioctl_exit; /* server returned no data */
+ else if (*plen > 0xFF00) {
+ cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+ le32_to_cpu(rsp->OutputOffset));
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ *out_data = kmalloc(*plen, GFP_KERNEL);
+ if (*out_data == NULL) {
+ rc = -ENOMEM;
+ goto ioctl_exit;
+ }
+
+ memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+ *plen);
+ioctl_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
@@ -1384,8 +1571,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
case MID_RESPONSE_RECEIVED:
credits_received = le16_to_cpu(buf->CreditRequest);
/* result already set, check signature */
- if (server->sec_mode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
int rc;
rc = smb2_verify_signature(&rqst, server);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258cb..36b0d37ea69b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/smb2pdu.h
*
- * Copyright (c) International Business Machines Corp., 2009, 2010
+ * Copyright (c) International Business Machines Corp., 2009, 2013
* Etersoft, 2012
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
#define SMB20_PROT_ID 0x0202
#define SMB21_PROT_ID 0x0210
#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
#define BAD_PROT_ID 0xFFFF
/* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SHI1005_FLAGS_ALL 0x0000FF33
/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008)
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
struct smb2_tree_disconnect_req {
struct smb2_hdr hdr;
@@ -420,7 +428,7 @@ struct smb2_create_req {
__le16 NameLength;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
- __u8 Buffer[8];
+ __u8 Buffer[0];
} __packed;
struct smb2_create_rsp {
@@ -477,6 +485,87 @@ struct create_lease {
struct lease_context lcontext;
} __packed;
+struct create_durable {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+ char SourceKey[24];
+ __le32 ChunkCount; /* we are only sending 1 */
+ __le32 Reserved;
+ /* array will only be one chunk long for us */
+ __le64 SourceOffset;
+ __le64 TargetOffset;
+ __le32 Length; /* how many bytes to copy */
+ __u32 Reserved2;
+} __packed;
+
+/* Response and Request are the same format */
+struct validate_negotiate_info {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialect[1];
+} __packed;
+
+#define RSS_CAPABLE 0x00000001
+#define RDMA_CAPABLE 0x00000002
+
+struct network_interface_info_ioctl_rsp {
+ __le32 Next; /* next interface. zero if this is last one */
+ __le32 IfIndex;
+ __le32 Capability; /* RSS or RDMA Capable */
+ __le32 Reserved;
+ __le64 LinkSpeed;
+ char SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __u32 Reserved2;
+ char Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __u32 Reserved2;
+ /* char * buffer[] */
+} __packed;
+
/* Currently defined values for close flags */
#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
struct smb2_close_req {
@@ -517,17 +606,25 @@ struct smb2_flush_rsp {
__le16 Reserved;
} __packed;
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE 0x00000000
+#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+
struct smb2_read_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 49 */
__u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Reserved;
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
__le32 Length;
__le64 Offset;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
__le32 MinimumCount;
- __le32 Channel; /* Reserved MBZ */
+ __le32 Channel; /* MBZ except for SMB3 or later */
__le32 RemainingBytes;
__le16 ReadChannelInfoOffset; /* Reserved MBZ */
__le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +642,9 @@ struct smb2_read_rsp {
__u8 Buffer[1];
} __packed;
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
struct smb2_write_req {
struct smb2_hdr hdr;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38ce..1a5ecbed40ed 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,11 +84,9 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *from_name, const char *to_name,
struct cifs_sb_info *cifs_sb);
-extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon,
- const char *full_path, int disposition,
- int desired_access, int create_options,
- struct cifs_fid *fid, __u32 *oplock,
- FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
+extern int smb2_open_file(const unsigned int xid,
+ struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf);
extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
extern int smb2_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
@@ -106,11 +104,13 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
const char *tree, struct cifs_tcon *tcon,
const struct nls_table *);
extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
-extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
- __le16 *path, u64 *persistent_fid, u64 *volatile_fid,
- __u32 desired_access, __u32 create_disposition,
- __u32 file_attributes, __u32 create_options,
- __u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+ __le16 *path, __u8 *oplock,
+ struct smb2_file_all_info *buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ char **out_data, u32 *plen /* returned data len */);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 01f0ac800780..4f2300d020c7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,6 +39,82 @@
#include "smb2status.h"
#include "smb2glob.h"
+static int
+smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ if (server->secmech.sdeschmacsha256 != NULL)
+ return 0; /* already allocated */
+
+ server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ if (IS_ERR(server->secmech.hmacsha256)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
+ rc = PTR_ERR(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacsha256);
+ server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacsha256) {
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
+ server->secmech.sdeschmacsha256->shash.flags = 0x0;
+
+ return 0;
+}
+
+static int
+smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ unsigned int size;
+ int rc;
+
+ if (server->secmech.sdesccmacaes != NULL)
+ return 0; /* already allocated */
+
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc)
+ return rc;
+
+ server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(server->secmech.cmacaes)) {
+ cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ rc = PTR_ERR(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.cmacaes);
+ server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdesccmacaes) {
+ cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.hmacsha256 = NULL;
+ server->secmech.cmacaes = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+ server->secmech.sdesccmacaes->shash.flags = 0x0;
+
+ return 0;
+}
+
+
int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
@@ -52,6 +128,12 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: shah256 alloc failed\n", __func__);
+ return rc;
+ }
+
rc = crypto_shash_setkey(server->secmech.hmacsha256,
server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
@@ -61,7 +143,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init sha256", __func__);
return rc;
}
@@ -116,11 +198,166 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
+void
+generate_smb3signingkey(struct TCP_Server_Info *server)
+{
+ unsigned char zero = 0x0;
+ __u8 i[4] = {0, 0, 0, 1};
+ __u8 L[4] = {0, 0, 0, 128};
+ int rc = 0;
+ unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+ unsigned char *hashptr = prfhash;
+
+ memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+ memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+
+ rc = smb3_crypto_shash_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_setkey(server->secmech.hmacsha256,
+ server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ i, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ "SMB2AESCMAC", 12);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ &zero, 1);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ "SmbSign", 8);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ L, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+ hashptr);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+
+smb3signkey_ret:
+ return;
+}
+
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
- cifs_dbg(FYI, "smb3 signatures not supported yet\n");
- return -EOPNOTSUPP;
+ int i, rc;
+ unsigned char smb3_signature[SMB2_CMACAES_SIZE];
+ unsigned char *sigptr = smb3_signature;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
+ struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+
+ memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
+ memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+ rc = crypto_shash_setkey(server->secmech.cmacaes,
+ server->smb3signingkey, SMB2_CMACAES_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+ return rc;
+ }
+
+ /*
+ * we already allocate sdesccmacaes when we init smb3 signing key,
+ * so unlike smb2 case we do not have to check here if secmech are
+ * initialized
+ */
+ rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+ return rc;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ if (iov[i].iov_len == 0)
+ continue;
+ if (iov[i].iov_base == NULL) {
+ cifs_dbg(VFS, "null iovec entry");
+ return -EIO;
+ }
+ /*
+ * The first entry includes a length field (which does not get
+ * signed that occupies the first 4 bytes before the header).
+ */
+ if (i == 0) {
+ if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+ break; /* nothing to sign or corrupt header */
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base + 4, iov[i].iov_len - 4);
+ } else {
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+ if (rc) {
+ cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ /* now hash over the rq_pages array */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ crypto_shash_update(&server->secmech.sdesccmacaes->shash,
+ p_iov.iov_base, p_iov.iov_len);
+ kunmap(rqst->rq_pages[i]);
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
+ sigptr);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
+
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+ return rc;
}
/* must be called with server->srv_mutex held */
@@ -275,8 +512,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dump_smb(mid->resp_buf, min_t(u32, 80, len));
/* convert the length into a more usable form */
- if ((len > 24) &&
- (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
+ if (len > 24 && server->sign) {
int rc;
rc = smb2_verify_signature(&rqst, server);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e087..d952ee48f4dc 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
*
- * Copyright (c) International Business Machines Corp., 2002,2009
+ * Copyright (c) International Business Machines Corp., 2002,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
/* IOCTL information */
/*
* List of ioctl/fsctl function codes that are or could be useful in the
- * future to remote clients like cifs or SMB2 client. There is probably
+ * future to remote clients like cifs or SMB2/SMB3 client. This is probably
* a slightly larger set of fsctls that NTFS local filesystem could handle,
* including the seven below that we do not have struct definitions for.
* Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
* remotely. Some of the following, such as the encryption/compression ones
* could be invoked from tools via a specialized hook into the VFS rather
* than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
*/
+#define FSCTL_DFS_GET_REFERRALS 0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,31 @@
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
#define FSCTL_SIS_LINK_FILES 0x0009C104
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
/* strange that the number for this op is not sequential with previous op */
#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
#define IO_REPARSE_TAG_HSM 0xC0000004
#define IO_REPARSE_TAG_SIS 0x80000007
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
+
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfbf4700d160..6fdcb1b4a106 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_freezekillable(server->response_q,
+ error = wait_event_freezekillable_unsafe(server->response_q,
midQ->mid_state != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
@@ -463,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
struct mid_q_entry *mid;
/* enable signing if server requires it */
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (server->sign)
hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
mid = AllocMidQEntry(hdr, server);
@@ -612,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dump_smb(mid->resp_buf, min_t(u32, 92, len));
/* convert the length into a more usable form */
- if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (server->sign) {
struct kvec iov;
int rc = 0;
struct smb_rqst rqst = { .rq_iov = &iov,
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 14a14808320c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -526,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
if (cii->c_flags & C_FLUSH)
coda_flag_inode_children(inode, C_FLUSH);
- if (de->d_count > 1)
+ if (d_count(de) > 1)
/* pretend it's valid, but don't change the flags */
goto out;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 64e5323cbbb0..277bd1be21fd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
- pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+ pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
dput(parent);
}
@@ -660,19 +660,15 @@ static int create_default_group(struct config_group *parent_group,
struct config_group *group)
{
int ret;
- struct qstr name;
struct configfs_dirent *sd;
/* We trust the caller holds a reference to parent */
struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
ret = -ENOMEM;
- child = d_alloc(parent, &name);
+ child = d_alloc_name(parent, group->cg_item.ci_name);
if (child) {
d_add(child, NULL);
@@ -1650,7 +1646,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
struct config_group *group = &subsys->su_group;
- struct qstr name;
struct dentry *dentry;
struct dentry *root;
struct configfs_dirent *sd;
@@ -1667,12 +1662,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
-
err = -ENOMEM;
- dentry = d_alloc(root, &name);
+ dentry = d_alloc_name(root, group->cg_item.ci_name);
if (dentry) {
d_add(dentry, NULL);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, count);
+ len = flush_write_buffer(file->f_path.dentry, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index 5dec39bd5ad2..fb4e2b7ea07b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
#include <trace/events/sched.h>
int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
struct core_name {
char *corename;
int used, size;
};
-static atomic_t call_count = ATOMIC_INIT(1);
/* The maximal length of core_pattern is also specified in sysctl.c */
-static int expand_corename(struct core_name *cn)
+static int expand_corename(struct core_name *cn, int size)
{
- char *old_corename = cn->corename;
-
- cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
- cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+ char *corename = krealloc(cn->corename, size, GFP_KERNEL);
- if (!cn->corename) {
- kfree(old_corename);
+ if (!corename)
return -ENOMEM;
- }
+ if (size > core_name_size) /* racy but harmless */
+ core_name_size = size;
+
+ cn->size = ksize(corename);
+ cn->corename = corename;
return 0;
}
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+ int free, need;
+
+again:
+ free = cn->size - cn->used;
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+ if (need < free) {
+ cn->used += need;
+ return 0;
+ }
+
+ if (!expand_corename(cn, cn->size + need - free + 1))
+ goto again;
+
+ return -ENOMEM;
+}
+
static int cn_printf(struct core_name *cn, const char *fmt, ...)
{
- char *cur;
- int need;
- int ret;
va_list arg;
+ int ret;
va_start(arg, fmt);
- need = vsnprintf(NULL, 0, fmt, arg);
+ ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
- if (likely(need < cn->size - cn->used - 1))
- goto out_printf;
+ return ret;
+}
- ret = expand_corename(cn);
- if (ret)
- goto expand_fail;
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+ int cur = cn->used;
+ va_list arg;
+ int ret;
-out_printf:
- cur = cn->corename + cn->used;
va_start(arg, fmt);
- vsnprintf(cur, need + 1, fmt, arg);
+ ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
- cn->used += need;
- return 0;
-expand_fail:
+ for (; cur < cn->used; ++cur) {
+ if (cn->corename[cur] == '/')
+ cn->corename[cur] = '!';
+ }
return ret;
}
-static void cn_escape(char *str)
-{
- for (; *str; str++)
- if (*str == '/')
- *str = '!';
-}
-
static int cn_print_exe_file(struct core_name *cn)
{
struct file *exe_file;
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
int ret;
exe_file = get_mm_exe_file(current->mm);
- if (!exe_file) {
- char *commstart = cn->corename + cn->used;
- ret = cn_printf(cn, "%s (path unknown)", current->comm);
- cn_escape(commstart);
- return ret;
- }
+ if (!exe_file)
+ return cn_esc_printf(cn, "%s (path unknown)", current->comm);
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
goto free_buf;
}
- cn_escape(path);
-
- ret = cn_printf(cn, "%s", path);
+ ret = cn_esc_printf(cn, "%s", path);
free_buf:
kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
int pid_in_pattern = 0;
int err = 0;
- cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
- cn->corename = kmalloc(cn->size, GFP_KERNEL);
cn->used = 0;
-
- if (!cn->corename)
+ cn->corename = NULL;
+ if (expand_corename(cn, core_name_size))
return -ENOMEM;
+ cn->corename[0] = '\0';
+
+ if (ispipe)
+ ++pat_ptr;
/* Repeat as long as we have more pattern to process and more output
space */
while (*pat_ptr) {
if (*pat_ptr != '%') {
- if (*pat_ptr == 0)
- goto out;
err = cn_printf(cn, "%c", *pat_ptr++);
} else {
switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
}
/* hostname */
- case 'h': {
- char *namestart = cn->corename + cn->used;
+ case 'h':
down_read(&uts_sem);
- err = cn_printf(cn, "%s",
+ err = cn_esc_printf(cn, "%s",
utsname()->nodename);
up_read(&uts_sem);
- cn_escape(namestart);
break;
- }
/* executable */
- case 'e': {
- char *commstart = cn->corename + cn->used;
- err = cn_printf(cn, "%s", current->comm);
- cn_escape(commstart);
+ case 'e':
+ err = cn_esc_printf(cn, "%s", current->comm);
break;
- }
case 'E':
err = cn_print_exe_file(cn);
break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
return err;
}
+out:
/* Backward compatibility with core_uses_pid:
*
* If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
if (err)
return err;
}
-out:
return ispipe;
}
@@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo)
if (ispipe < 0) {
printk(KERN_WARNING "format_corename failed\n");
printk(KERN_WARNING "Aborting core\n");
- goto fail_corename;
+ goto fail_unlock;
}
if (cprm.limit == 1) {
@@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
goto fail_dropcount;
}
- helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+ helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
@@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
argv_free(helper_argv);
if (retval) {
- printk(KERN_INFO "Core dump to %s pipe failed\n",
+ printk(KERN_INFO "Core dump to |%s pipe failed\n",
cn.corename);
goto close_fail;
}
@@ -669,7 +667,6 @@ fail_dropcount:
atomic_dec(&core_dump_count);
fail_unlock:
kfree(cn.corename);
-fail_corename:
coredump_finish(mm, core_dumped);
revert_creds(old_cred);
fail_creds:
diff --git a/fs/dcache.c b/fs/dcache.c
index 3199fe6863a8..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
- } else
- d_add(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry))
+ d_rehash(dentry);
+ }
return new;
}
EXPORT_SYMBOL(d_splice_alias);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+ atomic_set((atomic_t *)data, val);
+ return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+ *val = atomic_read((atomic_t *)data);
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+ debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_wo);
+
+ return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
static ssize_t read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
+ buf[buf_size] = '\0';
if (strtobool(buf, &bv) == 0)
*val = bv;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3fdef7..c7c83ff0f752 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
*/
void debugfs_remove_recursive(struct dentry *dentry)
{
- struct dentry *child;
- struct dentry *parent;
+ struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
return;
parent = dentry;
+ down:
mutex_lock(&parent->d_inode->i_mutex);
+ list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
+ if (!debugfs_positive(child))
+ continue;
- while (1) {
- /*
- * When all dentries under "parent" has been removed,
- * walk up the tree until we reach our starting point.
- */
- if (list_empty(&parent->d_subdirs)) {
- mutex_unlock(&parent->d_inode->i_mutex);
- if (parent == dentry)
- break;
- parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
- }
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
-
- /*
- * If "child" isn't empty, walk down the tree and
- * remove all its descendants first.
- */
+ /* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
- mutex_lock(&parent->d_inode->i_mutex);
- continue;
+ goto down;
}
- __debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
- /*
- * Avoid infinite loop if we fail to remove
- * one dentry.
- */
- mutex_unlock(&parent->d_inode->i_mutex);
- break;
- }
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ up:
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
- parent = dentry->d_parent;
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
- __debugfs_remove(dentry, parent);
+
+ if (child != dentry) {
+ next = list_entry(child->d_u.d_child.next, struct dentry,
+ d_u.d_child);
+ goto up;
+ }
+
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
mutex_unlock(&parent->d_inode->i_mutex);
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
const char *buf, size_t len)
{
- strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
- strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strlcpy(dlm_config.ci_cluster_name, buf,
+ sizeof(dlm_config.ci_cluster_name));
+ strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
int len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > r->res_ls->ls_lvblen)
+ len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
lkb->lkb_lvbseq = ms->m_lvbseq;
}
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > ls->ls_lvblen)
+ len = ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
}
return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
void dlm_stop_lockspaces(void)
{
struct dlm_ls *ls;
+ int count;
restart:
+ count = 0;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
- if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+ if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+ count++;
continue;
+ }
spin_unlock(&lslist_lock);
log_error(ls, "no userland control daemon, stopping lockspace");
dlm_ls_stop(ls);
goto restart;
}
spin_unlock(&lslist_lock);
+
+ if (count)
+ log_print("dlm user daemon left %d lockspaces", count);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
#include <linux/mutex.h>
#include <linux/sctp.h>
#include <linux/slab.h>
-#include <linux/sctp.h>
#include <net/sctp/sctp.h>
#include <net/ipv6.h>
@@ -126,6 +125,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
+ bool try_new_addr;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -144,6 +144,7 @@ struct dlm_node_addr {
struct list_head list;
int nodeid;
int addr_count;
+ int curr_addr_index;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
}
static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
- struct sockaddr *sa_out)
+ struct sockaddr *sa_out, bool try_new_addr)
{
struct sockaddr_storage sas;
struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
- if (na && na->addr_count)
- memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+ if (na && na->addr_count) {
+ if (try_new_addr) {
+ na->curr_addr_index++;
+ if (na->curr_addr_index == na->addr_count)
+ na->curr_addr_index = 0;
+ }
+
+ memcpy(&sas, na->addr[na->curr_addr_index ],
+ sizeof(struct sockaddr_storage));
+ }
spin_unlock(&dlm_node_addrs_spin);
if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
struct dlm_node_addr *na;
int rv = -EEXIST;
+ int addr_i;
spin_lock(&dlm_node_addrs_spin);
list_for_each_entry(na, &dlm_node_addrs, list) {
if (!na->addr_count)
continue;
- if (!addr_compare(na->addr[0], addr))
- continue;
-
- *nodeid = na->nodeid;
- rv = 0;
- break;
+ for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+ if (addr_compare(na->addr[addr_i], addr)) {
+ *nodeid = na->nodeid;
+ rv = 0;
+ goto unlock;
+ }
+ }
}
+unlock:
spin_unlock(&dlm_node_addrs_spin);
return rv;
}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
static void sctp_init_failed_foreach(struct connection *con)
{
+
+ /*
+ * Don't try to recover base con and handle race where the
+ * other node's assoc init creates a assoc and we get that
+ * notification, then we get a notification that our attempt
+ * failed due. This happens when we are still trying the primary
+ * address, but the other node has already tried secondary addrs
+ * and found one that worked.
+ */
+ if (!con->nodeid || con->sctp_assoc)
+ return;
+
+ log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+ con->try_new_addr = true;
con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
mutex_unlock(&connections_lock);
}
+static void retry_failed_sctp_send(struct connection *recv_con,
+ struct sctp_send_failed *sn_send_failed,
+ char *buf)
+{
+ int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+ struct dlm_mhandle *mh;
+ struct connection *con;
+ char *retry_buf;
+ int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+ log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ log_print("Could not look up con for nodeid %d\n",
+ nodeid);
+ return;
+ }
+
+ mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+ if (!mh) {
+ log_print("Could not allocate buf for retry.");
+ return;
+ }
+ memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+ dlm_lowcomms_commit_buffer(mh);
+
+ /*
+ * If we got a assoc changed event before the send failed event then
+ * we only need to retry the send.
+ */
+ if (con->sctp_assoc) {
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+ } else
+ sctp_init_failed_foreach(con);
+}
+
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
- if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_header.sn_type) {
+ case SCTP_SEND_FAILED:
+ retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+ break;
+ case SCTP_ASSOC_CHANGE:
switch (sn->sn_assoc_change.sac_state) {
-
case SCTP_COMM_UP:
case SCTP_RESTART:
{
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+ new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+ new_con->try_new_addr = false;
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
+ clear_bit(CF_INIT_PENDING, &new_con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
}
break;
- /* We don't know which INIT failed, so clear the PENDING flags
- * on them all. if assoc_id is zero then it will then try
- * again */
-
case SCTP_CANT_STR_ASSOC:
{
+ /* Will retry init when we get the send failed notification */
log_print("Can't start SCTP association - retrying");
- sctp_init_failed();
}
break;
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
+ default:
+ ; /* fall through */
}
}
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
kfree(e);
}
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ }
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
int addrlen;
struct kvec iov[1];
+ mutex_lock(&con->sock_mutex);
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- return;
-
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
+ goto unlock;
- if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+ if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+ con->try_new_addr)) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
if (list_empty(&con->writequeue)) {
spin_unlock(&con->writequeue_lock);
log_print("writequeue empty for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
- spin_unlock(&con->writequeue_lock);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
+ spin_unlock(&con->writequeue_lock);
+
+ if (rem_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+ log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+ log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+ }
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+ sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo->sinfo_flags |= SCTP_ADDR_OVER;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
}
else {
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
spin_unlock(&con->writequeue_lock);
}
+
+unlock:
+ mutex_unlock(&con->sock_mutex);
}
/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
int result = -EINVAL, num = 1, i, addr_len;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
+ int one = 1;
if (!con)
return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
goto create_delsock;
}
+ result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+ sizeof(one));
+ if (result < 0)
+ log_print("Could not set SCTP NODELAY error %d\n", result);
+
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
}
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
out:
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cfa109a4d5a2..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv);
+#define DECRYPT 0
+#define ENCRYPT 1
/**
* ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
}
/**
- * encrypt_scatterlist
+ * crypt_scatterlist
* @crypt_stat: Pointer to the crypt_stat struct to initialize.
- * @dest_sg: Destination of encrypted data
- * @src_sg: Data to be encrypted
- * @size: Length of data to be encrypted
- * @iv: iv to use during encryption
+ * @dst_sg: Destination of the data after performing the crypto operation
+ * @src_sg: Data to be encrypted or decrypted
+ * @size: Length of data
+ * @iv: IV to use
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Returns the number of bytes encrypted; negative value on error
+ * Returns the number of bytes encrypted or decrypted; negative value on error
*/
-static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
+static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+ struct scatterlist *dst_sg,
+ struct scatterlist *src_sg, int size,
+ unsigned char *iv, int op)
{
struct ablkcipher_request *req = NULL;
struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->flags |= ECRYPTFS_KEY_SET;
}
mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
- ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
- rc = crypto_ablkcipher_encrypt(req);
+ ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+ rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
+ crypto_ablkcipher_decrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY) {
struct extent_crypt_result *ecr = req->base.data;
@@ -407,41 +400,43 @@ out:
}
/**
- * ecryptfs_lower_offset_for_extent
+ * lower_offset_for_page
*
* Convert an eCryptfs page index into a lower byte offset
*/
-static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
- struct ecryptfs_crypt_stat *crypt_stat)
+static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *page)
{
- (*offset) = ecryptfs_lower_header_size(crypt_stat)
- + (crypt_stat->extent_size * extent_num);
+ return ecryptfs_lower_header_size(crypt_stat) +
+ (page->index << PAGE_CACHE_SHIFT);
}
/**
- * ecryptfs_encrypt_extent
- * @enc_extent_page: Allocated page into which to encrypt the data in
- * @page
+ * crypt_extent
* @crypt_stat: crypt_stat containing cryptographic context for the
* encryption operation
- * @page: Page containing plaintext data extent to encrypt
+ * @dst_page: The page to write the result into
+ * @src_page: The page to read from
* @extent_offset: Page extent offset for use in generating IV
+ * @op: ENCRYPT or DECRYPT to indicate the desired operation
*
- * Encrypts one extent of data.
+ * Encrypts or decrypts one extent of data.
*
* Return zero on success; non-zero otherwise
*/
-static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page,
- unsigned long extent_offset)
+static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page,
+ struct page *src_page,
+ unsigned long extent_offset, int op)
{
+ pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+ struct scatterlist src_sg, dst_sg;
+ size_t extent_size = crypt_stat->extent_size;
int rc;
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
+ extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
- page, (extent_offset
- * crypt_stat->extent_size),
- crypt_stat->extent_size, extent_iv);
+
+ sg_init_table(&src_sg, 1);
+ sg_init_table(&dst_sg, 1);
+
+ sg_set_page(&src_sg, src_page, extent_size,
+ extent_offset * extent_size);
+ sg_set_page(&dst_sg, dst_page, extent_size,
+ extent_offset * extent_size);
+
+ rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
+ extent_iv, op);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to encrypt page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ printk(KERN_ERR "%s: Error attempting to crypt page with "
+ "page_index = [%ld], extent_offset = [%ld]; "
+ "rc = [%d]\n", __func__, page_index, extent_offset, rc);
goto out;
}
rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
char *enc_extent_virt;
struct page *enc_extent_page = NULL;
loff_t extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
"encrypted extent\n");
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, enc_extent_page, page,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
goto out;
}
- ecryptfs_lower_offset_for_extent(
- &offset, ((((loff_t)page->index)
- * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
- offset, crypt_stat->extent_size);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to write lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
- }
- rc = 0;
-out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
}
- return rc;
-}
-static int ecryptfs_decrypt_extent(struct page *page,
- struct ecryptfs_crypt_stat *crypt_stat,
- struct page *enc_extent_page,
- unsigned long extent_offset)
-{
- loff_t extent_base;
- char extent_iv[ECRYPTFS_MAX_IV_BYTES];
- int rc;
-
- extent_base = (((loff_t)page->index)
- * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
- rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
- (extent_offset
- * crypt_stat->extent_size),
- enc_extent_page, 0,
- crypt_stat->extent_size, extent_iv);
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ enc_extent_virt = kmap(enc_extent_page);
+ rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
+ PAGE_CACHE_SIZE);
+ kunmap(enc_extent_page);
if (rc < 0) {
- printk(KERN_ERR "%s: Error attempting to decrypt to page with "
- "page->index = [%ld], extent_offset = [%ld]; "
- "rc = [%d]\n", __func__, page->index, extent_offset,
- rc);
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to write lower page; rc = [%d]\n",
+ rc);
goto out;
}
rc = 0;
out:
+ if (enc_extent_page) {
+ __free_page(enc_extent_page);
+ }
return rc;
}
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- char *enc_extent_virt;
- struct page *enc_extent_page = NULL;
+ char *page_virt;
unsigned long extent_offset;
+ loff_t lower_offset;
int rc = 0;
ecryptfs_inode = page->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- enc_extent_page = alloc_page(GFP_USER);
- if (!enc_extent_page) {
- rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR, "Error allocating memory for "
- "encrypted extent\n");
+
+ lower_offset = lower_offset_for_page(crypt_stat, page);
+ page_virt = kmap(page);
+ rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+ ecryptfs_inode);
+ kunmap(page);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to read lower page; rc = [%d]\n",
+ rc);
goto out;
}
- enc_extent_virt = kmap(enc_extent_page);
+
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- loff_t offset;
-
- ecryptfs_lower_offset_for_extent(
- &offset, ((page->index * (PAGE_CACHE_SIZE
- / crypt_stat->extent_size))
- + extent_offset), crypt_stat);
- rc = ecryptfs_read_lower(enc_extent_virt, offset,
- crypt_stat->extent_size,
- ecryptfs_inode);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Error attempting "
- "to read lower page; rc = [%d]"
- "\n", rc);
- goto out;
- }
- rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
- extent_offset);
+ rc = crypt_extent(crypt_stat, page, page,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -638,140 +590,7 @@ int ecryptfs_decrypt_page(struct page *page)
}
}
out:
- if (enc_extent_page) {
- kunmap(enc_extent_page);
- __free_page(enc_extent_page);
- }
- return rc;
-}
-
-/**
- * decrypt_scatterlist
- * @crypt_stat: Cryptographic context
- * @dest_sg: The destination scatterlist to decrypt into
- * @src_sg: The source scatterlist to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted; negative value on error
- */
-static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
- struct scatterlist *dest_sg,
- struct scatterlist *src_sg, int size,
- unsigned char *iv)
-{
- struct ablkcipher_request *req = NULL;
- struct extent_crypt_result ecr;
- int rc = 0;
-
- BUG_ON(!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
- crypt_stat->key_size);
- ecryptfs_dump_hex(crypt_stat->key,
- crypt_stat->key_size);
- }
-
- init_completion(&ecr.completion);
-
- mutex_lock(&crypt_stat->cs_tfm_mutex);
- req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
- if (!req) {
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -ENOMEM;
- goto out;
- }
-
- ablkcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- extent_crypt_complete, &ecr);
- /* Consider doing this once, when the file is opened */
- if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
- rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_ERR,
- "Error setting key; rc = [%d]\n",
- rc);
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- rc = -EINVAL;
- goto out;
- }
- crypt_stat->flags |= ECRYPTFS_KEY_SET;
- }
- mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
- ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
- rc = crypto_ablkcipher_decrypt(req);
- if (rc == -EINPROGRESS || rc == -EBUSY) {
- struct extent_crypt_result *ecr = req->base.data;
-
- wait_for_completion(&ecr->completion);
- rc = ecr->rc;
- INIT_COMPLETION(ecr->completion);
- }
-out:
- ablkcipher_request_free(req);
return rc;
-
-}
-
-/**
- * ecryptfs_encrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to encrypt into
- * @dst_offset: The offset in the page to encrypt into
- * @src_page: The page to encrypt from
- * @src_offset: The offset in the page to encrypt from
- * @size: The number of bytes to encrypt
- * @iv: The initialization vector to use for the encryption
- *
- * Returns the number of bytes encrypted
- */
-static int
-ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_init_table(&dst_sg, 1);
-
- sg_set_page(&src_sg, src_page, size, src_offset);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
- return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
-}
-
-/**
- * ecryptfs_decrypt_page_offset
- * @crypt_stat: The cryptographic context
- * @dst_page: The page to decrypt into
- * @dst_offset: The offset in the page to decrypt into
- * @src_page: The page to decrypt from
- * @src_offset: The offset in the page to decrypt from
- * @size: The number of bytes to decrypt
- * @iv: The initialization vector to use for the decryption
- *
- * Returns the number of bytes decrypted
- */
-static int
-ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *dst_page, int dst_offset,
- struct page *src_page, int src_offset, int size,
- unsigned char *iv)
-{
- struct scatterlist src_sg, dst_sg;
-
- sg_init_table(&src_sg, 1);
- sg_set_page(&src_sg, src_page, size, src_offset);
-
- sg_init_table(&dst_sg, 1);
- sg_set_page(&dst_sg, dst_page, size, dst_offset);
-
- return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
}
#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 24f1105fda3a..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
unsigned long nr_segs, loff_t pos)
{
ssize_t rc;
- struct path lower;
+ struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,9 +60,8 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
if (-EIOCBQUEUED == rc)
rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
- lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry);
- lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry);
- touch_atime(&lower);
+ path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
+ touch_atime(path);
}
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a2f2bb2c256d..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
- BUG_ON(!lower_dentry->d_count);
+ BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
ecryptfs_set_dentry_lower(dentry, lower_dentry);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt,
+ rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", lower_dentry, lower_mnt, rc);
+ "rc = [%d]\n", path->dentry, path->mnt, rc);
(*lower_file) = NULL;
}
return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
goto unlock;
}
msg_size = (sizeof(*msg) + msg->data_len);
- msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
+ msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
if (!msg_ctx->msg) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
"GFP_KERNEL memory\n", __func__, msg_size);
goto unlock;
}
- memcpy(msg_ctx->msg, msg, msg_size);
msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
wake_up_process(msg_ctx->task);
rc = 0;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 7e787fb90293..07ab49745e31 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -155,20 +155,8 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
return 0;
};
-/*
- * Handle negative dentry.
- */
-static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_add(dentry, NULL);
- return NULL;
-}
-
const struct inode_operations efivarfs_dir_inode_operations = {
- .lookup = efivarfs_lookup,
+ .lookup = simple_lookup,
.unlink = efivarfs_unlink,
.create = efivarfs_create,
};
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index deecc7294a67..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -1602,7 +1603,8 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!freezable_schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
@@ -1975,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
return -EINVAL;
if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ sigsaved = current->blocked;
+ set_current_blocked(&ksigmask);
}
error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1993,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
sizeof(sigsaved));
set_restore_sigmask();
} else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ set_current_blocked(&sigsaved);
}
return error;
@@ -2020,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
return -EFAULT;
sigset_from_compat(&ksigmask, &csigmask);
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ sigsaved = current->blocked;
+ set_current_blocked(&ksigmask);
}
err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2038,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
sizeof(sigsaved));
set_restore_sigmask();
} else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ set_current_blocked(&sigsaved);
}
return err;
diff --git a/fs/exec.c b/fs/exec.c
index 0f6c96c57b2f..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
* also take its birthdate (always earlier than our own).
*/
tsk->start_time = leader->start_time;
+ tsk->real_start_time = leader->real_start_time;
BUG_ON(!same_thread_group(leader, tsk));
BUG_ON(has_group_leader_pid(tsk));
@@ -947,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
* Note: The old leader also uses this pid until release_task
* is called. Odd but simple and correct.
*/
- detach_pid(tsk, PIDTYPE_PID);
tsk->pid = leader->pid;
- attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
+ change_pid(tsk, PIDTYPE_PID, task_pid(leader));
transfer_pid(leader, tsk, PIDTYPE_PGID);
transfer_pid(leader, tsk, PIDTYPE_SID);
@@ -1137,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm)
set_dumpable(current->mm, suid_dumpable);
}
- /*
- * Flush performance counters when crossing a
- * security domain:
- */
- if (!get_dumpable(current->mm))
- perf_event_exit_task(current);
-
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1207,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
+
+ /*
+ * Disable monitoring for regular users
+ * when executing setuid binaries. Must
+ * wait until new credentials are committed
+ * by commit_creds() above
+ */
+ if (get_dumpable(current->mm) != SUID_DUMP_USER)
+ perf_event_exit_task(current);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
@@ -1463,7 +1465,6 @@ static int do_execve_common(const char *filename,
struct files_struct *displaced;
bool clear_in_exec;
int retval;
- const struct cred *cred = current_cred();
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1472,7 +1473,7 @@ static int do_execve_common(const char *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+ atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext3_sync_file_enter(file, datasync);
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated state */
+ smp_rmb();
+ if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+ return -EROFS;
return 0;
-
+ }
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
- trace_ext3_invalidatepage(page, offset);
+ trace_ext3_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1984,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7523c61f796c..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -1783,11 +1780,11 @@ retry:
inode->i_op = &ext3_file_inode_operations;
inode->i_fop = &ext3_file_operations;
ext3_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext3_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c50c76190373 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,6 +27,7 @@
#include <linux/seq_file.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
@@ -174,6 +175,11 @@ static void ext3_handle_error(struct super_block *sb)
if (test_opt (sb, ERRORS_RO)) {
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
ext3_commit_super(sb, es, 1);
@@ -291,8 +297,14 @@ void ext3_abort(struct super_block *sb, const char *function,
ext3_msg(sb, KERN_CRIT,
"error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- sb->s_flags |= MS_RDONLY;
set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+ /*
+ * Make sure updated value of ->s_mount_state will be visible
+ * before ->s_flags update.
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+
if (EXT3_SB(sb)->s_journal)
journal_abort(EXT3_SB(sb)->s_journal, -EIO);
}
@@ -808,6 +820,7 @@ enum {
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_journal_path,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -849,6 +862,7 @@ static const match_table_t tokens = {
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
@@ -964,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb,
int option;
kuid_t uid;
kgid_t gid;
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
#ifdef CONFIG_QUOTA
int qfmt;
#endif
@@ -1118,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb,
return 0;
*journal_devnum = option;
break;
+ case Opt_journal_path:
+ if (is_remount) {
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
+ return 0;
+ }
+
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext3_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return 0;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext3_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return 0;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext3_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return 0;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ break;
case Opt_noload:
set_opt (sbi->s_mount_opt, NOLOAD);
break;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
- group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- block) >>
+ group = (block -
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4af03ea84aa3..953f50481da2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
};
/*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_DIRECT 0x0004
+#define EXT4_IO_END_DIRECT 0x0002
/*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
+ atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -571,6 +561,18 @@ enum {
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
+
+/*
* Flags used by ext4_free_blocks
*/
#define EXT4_FREE_BLOCKS_METADATA 0x0001
@@ -581,11 +583,6 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
/*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
-
-/*
* ioctl commands
*/
#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -605,6 +602,7 @@ enum {
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -879,6 +877,7 @@ struct ext4_inode_info {
rwlock_t i_es_lock;
struct list_head i_es_lru;
unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -903,12 +902,22 @@ struct ext4_inode_info {
qsize_t i_reserved_quota;
#endif
- /* completed IOs that might need unwritten extents handling */
- struct list_head i_completed_io_list;
+ /* Lock protecting lists below */
spinlock_t i_completed_io_lock;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
+ struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
- struct work_struct i_unwritten_work; /* deferred extent conversion */
+ struct work_struct i_rsv_conversion_work;
+ struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1245,7 +1254,6 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
- unsigned int s_max_writeback_mb_bump;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
@@ -1281,8 +1289,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for unreserved extent convertions (dio) */
+ struct workqueue_struct *unrsv_conversion_wq;
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
/* timer for periodic error stats printing */
struct timer_list s_err_report;
@@ -1307,6 +1317,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
@@ -1342,6 +1353,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ /* Writeback has to have coversion transaction reserved */
+ WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+ !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -1374,6 +1388,7 @@ enum {
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1999,7 +2014,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2102,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2110,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2128,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2183,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
- __LINE__, ## message)
extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(4, 5)
void __ext4_abort(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
- __LINE__, ## message)
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
- __LINE__, ## message)
extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
- __LINE__, msg)
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
struct super_block *, ext4_group_t,
unsigned long, ext4_fsblk_t,
const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
- __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -2312,6 +2383,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2598,8 +2670,7 @@ struct ext4_extent;
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
- int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2680,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2626,7 +2697,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path *,
struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
+ struct ext4_ext_path *,
+ int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
@@ -2635,7 +2707,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2650,12 +2722,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2668,20 +2743,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
extern int ext4_mmp_csum_verify(struct super_block *sb,
struct mmp_struct *mmp);
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
+ = BH_JBDPrivateStart,
BH_AllocFromCluster, /* allocated blocks were part of already
- * allocated cluster. Note that this flag will
- * never, ever appear in a buffer_head's state
- * flag. See EXT4_MAP_FROM_CLUSTER to see where
- * this is used. */
+ * allocated cluster. */
};
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
/*
* Wrappers for jbd2_journal_start/end.
*/
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
{
journal_t *journal;
might_sleep();
-
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
+ return -EROFS;
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
/*
* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly.
*/
- if (is_journal_aborted(journal)) {
+ if (journal && is_journal_aborted(journal)) {
ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
+ return -EROFS;
}
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return err;
}
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
+{
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn, struct buffer_head *bh,
handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
#define EXT4_HT_MIGRATE 8
#define EXT4_HT_MOVE_EXTENTS 9
#define EXT4_HT_XATTR 10
-#define EXT4_HT_MAX 11
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks);
+ int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int nblocks)
+ int blocks, int rsv_blocks)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..093f5b3a69b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth)
+ int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0;
@@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line,
corrupted:
ext4_error_inode(inode, function, line, 0,
- "bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- error_msg, le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
-
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return -EIO;
}
-#define ext4_ext_check(inode, eh, depth) \
- __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk) \
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
int ext4_ext_check_inode(struct inode *inode)
{
- return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
-static int __ext4_ext_check_block(const char *function, unsigned int line,
- struct inode *inode,
- struct ext4_extent_header *eh,
- int depth,
- struct buffer_head *bh)
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+ struct inode *inode, ext4_fsblk_t pblk, int depth,
+ int flags)
{
- int ret;
+ struct buffer_head *bh;
+ int err;
- if (buffer_verified(bh))
- return 0;
- ret = ext4_ext_check(inode, eh, depth);
- if (ret)
- return ret;
+ bh = sb_getblk(inode->i_sb, pblk);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+ err = bh_submit_read(bh);
+ if (err < 0)
+ goto errout;
+ }
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+ return bh;
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
set_buffer_verified(bh);
- return ret;
+ /*
+ * If this is a leaf block, cache all of its entries
+ */
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+ struct ext4_extent_header *eh = ext_block_hdr(bh);
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev,
+ lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_uninitialized(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+ }
+ return bh;
+errout:
+ put_bh(bh);
+ return ERR_PTR(err);
+
}
-#define ext4_ext_check_block(inode, eh, depth, bh) \
- __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+#define read_extent_tree_block(inode, pblk, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+ (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh = 0;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ up_read(&ei->i_data_sem);
+ return -ENOMEM;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
@@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh)) {
- ret = -ENOMEM;
+ bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+ flags);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto err;
}
- if (!bh_uptodate_or_lock(bh)) {
- trace_ext4_ext_load_extent(inode, block,
- path[ppos].p_block);
- ret = bh_submit_read(bh);
- if (ret < 0) {
- put_bh(bh);
- goto err;
- }
- }
+
eh = ext_block_hdr(bh);
ppos++;
if (unlikely(ppos > depth)) {
@@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
- i--;
-
- ret = ext4_ext_check_block(inode, eh, i, bh);
- if (ret < 0)
- goto err;
}
path[ppos].p_depth = i;
@@ -1198,7 +1293,8 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int flags,
+ unsigned int mb_flags,
+ unsigned int gb_flags,
struct ext4_ext_path *path,
struct ext4_extent *newext)
{
@@ -1220,7 +1316,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, flags, path, newext, i);
+ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
goto out;
@@ -1228,12 +1324,12 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, flags, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
if (err)
goto out;
@@ -1241,7 +1337,7 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1412,29 +1508,21 @@ got_index:
ix++;
block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
- eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_block(inode, eh,
- path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
+ bh = read_extent_tree_block(inode, block,
+ path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
ex = EXT_FIRST_EXTENT(eh);
found_extent:
*logical = le32_to_cpu(ex->ee_block);
@@ -1793,7 +1881,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext, int flag)
+ struct ext4_extent *newext, int gb_flags)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1802,7 +1890,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
- int flags = 0;
+ int mb_flags = 0;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1817,7 +1905,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
/*
* Try to see whether we should rather test the extent on
@@ -1920,7 +2008,7 @@ prepend:
if (next != EXT_MAX_BLOCKS) {
ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
@@ -1939,9 +2027,10 @@ prepend:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
- flags = EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ mb_flags = EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2007,7 +2096,7 @@ has_space:
merge:
/* try to merge extents */
- if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);
@@ -2050,7 +2139,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
path = NULL;
}
- path = ext4_ext_find_extent(inode, block, path);
+ path = ext4_ext_find_extent(inode, block, path, 0);
if (IS_ERR(path)) {
up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
@@ -2125,7 +2214,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
- flags |= FIEMAP_EXTENT_DELALLOC;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
}
up_read(&EXT4_I(inode)->i_data_sem);
@@ -2328,17 +2418,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
}
/*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
*
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
*
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
*/
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
int depth;
@@ -2349,7 +2437,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
depth = ext_depth(inode);
- if (chunk)
+ if (extents <= 1)
index = depth * 2;
else
index = depth * 3;
@@ -2357,20 +2445,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
return index;
}
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- ext4_fsblk_t *partial_cluster,
+ long long *partial_cluster,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = 0;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
- else if (ext4_should_journal_data(inode))
- flags |= EXT4_FREE_BLOCKS_FORGET;
+ int flags = get_default_free_blocks_flags(inode);
/*
* For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2480,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2414,41 +2507,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/*
* If the block range to be freed didn't start at the
* beginning of a cluster, and we removed the entire
- * extent, save the partial cluster here, since we
- * might need to delete if we determine that the
- * truncate operation has removed all of the blocks in
- * the cluster.
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
*/
- if (pblk & (sbi->s_cluster_ratio - 1) &&
- (ee_len == num))
+ unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
- else
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
*partial_cluster = 0;
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* head removal */
- ext4_lblk_t num;
- ext4_fsblk_t start;
-
- num = to - from;
- start = ext4_ext_pblock(ex);
-
- ext_debug("free first %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -2461,12 +2559,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * has been released from it. It gets negative in case
+ * that the cluster is still used.
* @start: The first block to remove
* @end: The last block to remove
*/
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2581,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2593,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
return -EIO;
}
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2622,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (pblk & (sbi->s_cluster_ratio - 1))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2707,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else
+ } else if (*partial_cluster > 0)
*partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2725,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * If there is still a entry in the leaf node, check to see if
- * it references the partial cluster. This is the only place
- * where it could; if it doesn't, we can free the cluster.
+ * Free the partial cluster only if the current extent does not
+ * reference it. Otherwise we might free used cluster.
*/
- if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ if (*partial_cluster > 0 &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2775,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- ext4_fsblk_t partial_cluster = 0;
+ long long partial_cluster = 0;
handle_t *handle;
int i = 0, err = 0;
@@ -2676,7 +2787,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
- trace_ext4_ext_remove_space(inode, start, depth);
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
* Check if we are removing extents inside the extent tree. If that
@@ -2690,7 +2801,7 @@ again:
ext4_lblk_t ee_block;
/* find extent for this block */
- path = ext4_ext_find_extent(inode, end, NULL);
+ path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2732,6 +2843,7 @@ again:
*/
err = ext4_split_extent_at(handle, inode, path,
end + 1, split_flag,
+ EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -2760,7 +2872,7 @@ again:
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
- if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
err = -EIO;
goto out;
}
@@ -2807,21 +2919,21 @@ again:
ext_debug("move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
- if (!bh) {
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(bh)) {
/* should we reset i_size? */
- err = -EIO;
+ err = PTR_ERR(bh);
break;
}
+ /* Yield here to deal with large extent trees.
+ * Should be a no-op if we did IO above. */
+ cond_resched();
if (WARN_ON(i + 1 > depth)) {
err = -EIO;
break;
}
- if (ext4_ext_check_block(inode, ext_block_hdr(bh),
- depth - i - 1, bh)) {
- err = -EIO;
- break;
- }
path[i + 1].p_bh = bh;
/* save actual number of indexes since this
@@ -2844,17 +2956,14 @@ again:
}
}
- trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
- path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
/* If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
* cluster as well. */
- if (partial_cluster && path->p_hdr->eh_entries == 0) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -3159,7 +3268,7 @@ static int ext4_split_extent(handle_t *handle,
* result in split of original leaf or extent zeroout.
*/
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3543,7 +3652,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
if (err < 0)
goto out;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4030,7 +4139,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -4242,8 +4351,8 @@ got_allocated_blocks:
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), fb_flags);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
goto out2;
}
@@ -4363,8 +4472,9 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
-
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+ ext4_es_lru_add(inode);
return err ? err : allocated;
}
@@ -4386,9 +4496,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ return;
+ }
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ ext4_std_error(inode->i_sb, err);
}
static void ext4_falloc_update_inode(struct inode *inode,
@@ -4446,7 +4567,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(file, offset, len);
+ return ext4_punch_hole(inode, offset, len);
ret = ext4_convert_inline_data(inode);
if (ret)
@@ -4548,10 +4669,9 @@ retry:
* function, to convert the fallocated extents after IO is completed.
* Returns 0 on success.
*/
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
{
- handle_t *handle;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
@@ -4566,16 +4686,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
map.m_lblk);
/*
- * credits to insert 1 extent into extent tree
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
*/
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4722,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret <= 0 || ret2 )
+ if (credits)
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
}
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
@@ -4659,7 +4798,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
error = ext4_get_inode_loc(inode, &iloc);
if (error)
return error;
- physical = iloc.bh->b_blocknr << blockbits;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
offset = EXT4_GOOD_OLD_INODE_SIZE +
EXT4_I(inode)->i_extra_isize;
physical += offset;
@@ -4667,7 +4806,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
} else { /* external block */
- physical = EXT4_I(inode)->i_file_acl << blockbits;
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
}
@@ -4719,6 +4858,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
len_blks, fieinfo);
}
-
+ ext4_es_lru_add(inode);
return error;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..00e6589ee7d2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
* Ext4 extents status tree core functions.
*/
#include <linux/rbtree.h>
+#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"
@@ -147,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
int __init ext4_init_es(void)
{
@@ -260,7 +263,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
if (tree->cache_es) {
es1 = tree->cache_es;
if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %llx\n",
+ es_debug("%u cached by [%u/%u) %llu %x\n",
lblk, es1->es_lblk, es1->es_len,
ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
@@ -291,7 +294,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
@@ -417,7 +419,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
unsigned short ee_len;
int depth, ee_status, es_status;
- path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+ path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path))
return;
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
if (in_range(es->es_lblk, ee_block, ee_len)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu we can find an extent "
"at block [%d/%d/%llu/%c], but we "
"want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (es->es_lblk < ee_block ||
ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
if (ee_status ^ es_status) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
* that we don't want to add an written/unwritten extent.
*/
if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"can't find an extent at block %d but we want "
"to add an written/unwritten extent "
"[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
* We want to add a delayed/hole extent but this
* block has been allocated.
*/
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can find blocks but we want to add a "
"delayed/hole extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
return;
} else if (ext4_es_is_written(es)) {
if (retval != es->es_len) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu retval %d != es_len %d\n",
inode->i_ino, retval, es->es_len);
return;
}
if (map.m_pblk != ext4_es_pblock(es)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu m_pblk %llu != "
"es_pblk %llu\n",
inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
}
} else if (retval == 0) {
if (ext4_es_is_written(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can't find the block but we want to add "
"an written extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -632,22 +634,20 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
*
* Return 0 on success, error code on failure.
*/
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status)
+ unsigned int status)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
- es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
if (!len)
@@ -667,18 +667,55 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
err = __es_remove_extent(inode, lblk, end);
if (err != 0)
goto error;
+retry:
err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
}
/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status *es;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_cache_extent(inode, &newes);
+
+ if (!len)
+ return;
+
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
* ext4_es_lookup_extent() looks up an extent in extent status tree.
*
* ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
@@ -734,7 +771,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_lookup_extent_exit(inode, es, found);
return found;
}
@@ -748,8 +784,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err = 0;
+ int err;
+retry:
+ err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -784,6 +822,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (err) {
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
goto out;
}
} else {
@@ -878,38 +920,69 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
EXTENT_STATUS_WRITTEN);
}
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
- struct ext4_inode_info *ei;
- struct list_head *cur, *tmp, scanned;
- int nr_to_scan = sc->nr_to_scan;
- int ret, nr_shrunk = 0;
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
- trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
- if (!nr_to_scan)
- return ret;
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
- INIT_LIST_HEAD(&scanned);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
+{
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skiped);
+ int ret, nr_shrunk = 0;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
spin_lock(&sbi->s_es_lru_lock);
+
+retry:
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- list_move_tail(cur, &scanned);
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
- if (ei->i_es_lru_nr == 0) {
- read_unlock(&ei->i_es_lock);
+ /*
+ * Skip the inode that is newer than the last_sorted
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+ if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+ list_move_tail(cur, &skiped);
continue;
}
- read_unlock(&ei->i_es_lock);
+
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+ continue;
write_lock(&ei->i_es_lock);
ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
write_unlock(&ei->i_es_lock);
nr_shrunk += ret;
@@ -917,29 +990,71 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (nr_to_scan == 0)
break;
}
- list_splice_tail(&scanned, &sbi->s_es_lru);
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skiped, &sbi->s_es_lru);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+ * forward progress, sort the list and try again.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+ * If there are no non-precached inodes left on the
+ * list, start releasing precached extents.
+ */
+ if (ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))
+ skip_precached = 0;
+ goto retry;
+ }
+
spin_unlock(&sbi->s_es_lru_lock);
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+ return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
return ret;
}
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
- struct ext4_sb_info *sbi;
-
- sbi = EXT4_SB(sb);
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
sbi->s_es_shrinker.shrink = ext4_es_shrink;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&sbi->s_es_shrinker);
}
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+ unregister_shrinker(&sbi->s_es_shrinker);
}
void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +1062,14 @@ void ext4_es_lru_add(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
spin_lock(&sbi->s_es_lru_lock);
if (list_empty(&ei->i_es_lru))
list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
- else
- list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
}
@@ -974,10 +1092,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
struct rb_node *node;
struct extent_status *es;
int nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (ei->i_es_lru_nr == 0)
return 0;
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
node = rb_first(&tree->root);
while (node != NULL) {
es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..3e83aef3653a 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,16 +29,27 @@
/*
* These flags live in the high bits of extent_status.es_pblk
*/
-#define EXTENT_STATUS_WRITTEN (1ULL << 63)
-#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
-#define EXTENT_STATUS_DELAYED (1ULL << 61)
-#define EXTENT_STATUS_HOLE (1ULL << 60)
+#define ES_SHIFT 60
+
+#define EXTENT_STATUS_WRITTEN (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED (1 << 1)
+#define EXTENT_STATUS_HOLE (1 << 0)
#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+#define ES_WRITTEN (1ULL << 63)
+#define ES_UNWRITTEN (1ULL << 62)
+#define ES_DELAYED (1ULL << 61)
+#define ES_HOLE (1ULL << 60)
+
+#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+ ES_DELAYED | ES_HOLE)
+
+struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
@@ -59,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status);
+ unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
@@ -71,32 +85,32 @@ extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
static inline int ext4_es_is_written(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+ return (es->es_pblk & ES_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+ return (es->es_pblk & ES_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+ return (es->es_pblk & ES_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+ return (es->es_pblk & ES_HOLE) != 0;
}
-static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+static inline unsigned int ext4_es_status(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_FLAGS);
+ return es->es_pblk >> ES_SHIFT;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
- return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+ return es->es_pblk & ~ES_MASK;
}
static inline void ext4_es_store_pblock(struct extent_status *es,
@@ -104,23 +118,20 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
{
ext4_fsblk_t block;
- block = (pb & ~EXTENT_STATUS_FLAGS) |
- (es->es_pblk & EXTENT_STATUS_FLAGS);
+ block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
es->es_pblk = block;
}
static inline void ext4_es_store_status(struct extent_status *es,
- unsigned long long status)
+ unsigned int status)
{
- ext4_fsblk_t block;
-
- block = (status & EXTENT_STATUS_FLAGS) |
- (es->es_pblk & ~EXTENT_STATUS_FLAGS);
- es->es_pblk = block;
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (es->es_pblk & ~ES_MASK));
}
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 469361dbe619..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
}
last++;
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
@@ -530,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -541,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -556,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
&map, &holeoff);
if (!unwritten) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
- int err;
- int ret;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (ret == 0)
- ret = err;
- return ret;
-}
-
/*
* akpm: A new design for ext4_sync_file().
*
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret, err;
+ int ret = 0, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_enter(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_sb->s_flags & MS_RDONLY)
- goto out;
-
- ret = ext4_flush_unwritten_io(inode);
- if (ret < 0)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
goto out;
+ }
if (!journal) {
- ret = __sync_inode(inode, datasync);
+ ret = generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!ret)
ret = err;
}
- out:
- mutex_unlock(&inode->i_mutex);
+out:
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..c6d0ce7b8078 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -625,6 +625,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk.
+ */
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_inode *raw_inode;
+ struct buffer_head *bh;
+ unsigned long dtime, now;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0, recentcy = dirty_expire_interval;
+
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (unlikely(!gdp))
+ return 0;
+
+ bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ (ino / inodes_per_block));
+ if (unlikely(!bh) || !buffer_uptodate(bh))
+ /*
+ * If the block is not in the buffer cache, then it
+ * must have been written out.
+ */
+ goto out;
+
+ offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+ dtime = le32_to_cpu(raw_inode->i_dtime);
+ now = get_seconds();
+ if (!buffer_dirty(bh) && (recentcy > 5))
+ /*
+ * Five seconds should be enough time for a block to be
+ * committed to the platter once it is sent to the HDD
+ */
+ recentcy = 5;
+
+ if (dtime && (dtime < now) && (now < dtime + recentcy))
+ ret = 1;
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -734,20 +779,23 @@ repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb), ino);
- if (ino >= EXT4_INODES_PER_GROUP(sb)) {
- if (++group == ngroups)
- group = 0;
- continue;
- }
+ if (ino >= EXT4_INODES_PER_GROUP(sb))
+ goto next_group;
if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
continue;
}
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, ino)) {
+ ino++;
+ goto next_inode;
+ }
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks);
+ handle_type, nblocks,
+ 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
@@ -766,8 +814,12 @@ repeat_in_this_group:
ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
+next_inode:
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
+next_group:
+ if (++group == ngroups)
+ group = 0;
}
err = -ENOSPC;
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map, err);
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
return err;
}
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
- mutex_lock(&inode->i_mutex);
- ext4_flush_unwritten_io(inode);
- mutex_unlock(&inode->i_mutex);
- }
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, we need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
- * 2 dindirect blocks, and 1 tindirect block
- */
- return DIV_ROUND_UP(nrblocks,
- EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
- }
/*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
*/
- indirects = nrblocks * 2 + 1;
- return indirects;
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
/*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1a346a6bdc8f..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
entry = (struct ext4_xattr_entry *)
((void *)raw_inode + EXT4_I(inode)->i_inline_off);
- free += le32_to_cpu(entry->e_value_size);
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
goto out;
}
@@ -1810,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
if (error)
goto out;
- physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..95af844eed22 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
/*
* Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
-
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
- }
- return num;
-}
-
#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
struct inode *inode,
@@ -524,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
if (es_map->m_lblk != map->m_lblk ||
es_map->m_flags != map->m_flags ||
es_map->m_pblk != map->m_pblk) {
- printk("ES cache assertation failed for inode: %lu "
+ printk("ES cache assertion failed for inode: %lu "
"es_cached ex [%d/%d/%llu/%x] != "
"found ex [%d/%d/%llu/%x] retval %d flags %x\n",
inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -575,6 +516,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -611,16 +553,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
}
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (lookup)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -712,16 +653,15 @@ found:
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (allocation)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
/*
* If the extent has been zeroed out, we don't need to update
@@ -1118,10 +1058,13 @@ static int ext4_write_end(struct file *file,
}
}
- if (ext4_has_inline_data(inode))
- copied = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- else
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1157,8 +1100,6 @@ static int ext4_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- if (copied < 0)
- ret = copied;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1415,21 +1356,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
int num_clusters;
ext4_fsblk_t lblk;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
@@ -1460,140 +1408,43 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int skip_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /*
- * skip page if block allocation undone and
- * block is dirty
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh))
- skip_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (skip_page) {
- unlock_page(page);
- continue;
- }
-
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&io_submit, page, len,
- mpd->wbc);
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t start, last;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
index = mpd->first_page;
end = mpd->next_page - 1;
-
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
pagevec_init(&pvec, 0);
while (index <= end) {
@@ -1606,14 +1457,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1494,6 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
-
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- /*
- * We're in delalloc path and it is possible that we're going to
- * need more metadata blocks than previously reserved. However
- * we must not fail because we're in writeback and there is
- * nothing we can do about it so it might result in data loss.
- * So use reserved blocks to allocate metadata if possible.
- */
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_METADATA_NOFAIL;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
-
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will just let
- * mpage_da_submit_io() unlock all of the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
-
- if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
- mpd->retval = err;
- goto submit_io;
- }
-
- /*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
- */
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
- }
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd);
-
- /* Mark this page range as having been completed */
- mpd->io_done = 1;
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
- unsigned long b_state)
-{
- sector_t next;
- int blkbits = mpd->inode->i_blkbits;
- int nrblocks = mpd->b_size >> blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= (8*1024*1024 >> blkbits))
- goto flush_it;
-
- /* check if the reserved journal credits might overflow */
- if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = 1 << blkbits;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
-
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += 1 << blkbits;
- return;
- }
-
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
- return;
-}
-
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1885,7 +1528,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
-
+ ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read((&EXT4_I(inode)->i_data_sem));
@@ -1990,16 +1633,15 @@ add_delayed:
set_buffer_delay(bh);
} else if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (lookup)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -2156,7 +1798,7 @@ out:
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
* - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1876,405 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- memset(&io_submit, 0, sizeof(io_submit));
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
ret = ext4_bio_write_page(&io_submit, page, len, wbc);
ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks upto full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ unsigned long b_state)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return 0;
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = b_state & BH_FLAGS;
+ return 1;
+ }
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return 1;
+ }
+ return 0;
+}
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+ lblk >= blocks) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return false;
+ if (lblk >= blocks)
+ return true;
+ continue;
+ }
+ if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+ return false;
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Upto 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ add_page_bufs_to_extent(mpd, head, bh,
+ lblk);
+ pagevec_release(&pvec);
+ return 0;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (++lblk < blocks &&
+ (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err;
+
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an uninitialized extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ if (ext4_should_dioread_nolock(inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(handle_t *handle,
- struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- struct buffer_head *bh, *head;
- struct inode *inode = mapping->host;
- struct pagevec pvec;
- unsigned int nr_pages;
- sector_t logical;
- pgoff_t index, end;
- long nr_to_write = wbc->nr_to_write;
- int i, tag, ret = 0;
-
- memset(mpd, 0, sizeof(struct mpage_da_data));
- mpd->wbc = wbc;
- mpd->inode = inode;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ do {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ } while (map->m_len);
+
+ /* Update on-disk size after IO is submitted */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+
+ ext4_update_i_disksize(inode, disksize);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- return 0;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2318,31 +2289,21 @@ static int write_cache_pages_da(handle_t *handle,
if (page->index > end)
goto out;
- *done_index = page->index + 1;
-
- /*
- * If we can't merge this page, and we have
- * accumulated an contiguous region, write it
- */
- if ((mpd->next_page != page->index) &&
- (mpd->next_page != mpd->first_page)) {
- mpage_da_map_and_submit(mpd);
- goto ret_extent_tail;
- }
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
lock_page(page);
-
/*
- * If the page is no longer dirty, or its
- * mapping no longer corresponds to inode we
- * are writing (which means it has been
- * truncated or invalidated), or the page is
- * already under writeback and we are not
- * doing a data integrity writeback, skip the page
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
if (!PageDirty(page) ||
(PageWriteback(page) &&
- (wbc->sync_mode == WB_SYNC_NONE)) ||
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
@@ -2351,106 +2312,70 @@ static int write_cache_pages_da(handle_t *handle,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- /*
- * If we have inline data and arrive here, it means that
- * we will soon create the block for the 1st page, so
- * we'd better clear the inline data here.
- */
- if (ext4_has_inline_data(inode)) {
- BUG_ON(ext4_test_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
- }
-
- if (mpd->next_page != page->index)
+ if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
/* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate unmapped blocks
- * in the same page. Otherwise we won't make
- * progress with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) &&
- buffer_mapped(bh)) {
- /*
- * mapped dirty buffer. We need to
- * update the b_state because we look
- * at b_state in mpage_da_map_blocks.
- * We don't update b_size because if we
- * find an unmapped buffer_head later
- * we need to use the b_state flag of
- * that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state =
- bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
-
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
+ if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ goto out;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
goto out;
}
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+ mpd->next_page - mpd->first_page >=
+ mpd->wbc->nr_to_write)
+ goto out;
}
pagevec_release(&pvec);
cond_resched();
}
return 0;
-ret_extent_tail:
- ret = MPAGE_DA_EXTENT_TAIL;
out:
pagevec_release(&pvec);
- cond_resched();
- return ret;
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2385,165 @@ static int ext4_da_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+ }
+
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert upto one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
-
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
blk_start_plug(&plug);
- while (!ret && wbc->nr_to_write > 0) {
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- blk_finish_plug(&plug);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call write_cache_pages_da() to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4 and submit them.
- */
- ret = write_cache_pages_da(handle, mapping,
- wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * Got one extent now try with rest of the pages.
- * If mpd.retval is set -EIO, journal is aborted.
- * So we don't need to write any more.
- */
- pages_written += mpd.pages_written;
- ret = mpd.retval;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
blk_finish_plug(&plug);
- if (!io_done && !cycled) {
+ if (!ret && !cycled) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
@@ -2829,7 +2755,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -2838,10 +2765,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -2864,7 +2791,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -2989,37 +2916,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- trace_ext4_invalidatepage(page, offset);
+ trace_ext4_invalidatepage(page, offset, length);
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
- block_invalidatepage(page, offset);
+ block_invalidatepage(page, offset, length);
}
static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- return jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
}
/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +2997,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end) {
+ inode_dio_done(inode);
+ if (is_async)
+ aio_complete(iocb, ret, 0);
+ return;
+ }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3011,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
iocb->private = NULL;
-
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
-out:
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
io_end->offset = offset;
io_end->size = size;
if (is_async) {
io_end->iocb = iocb;
io_end->result = ret;
}
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
/*
@@ -3129,6 +3051,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3059,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
BUG_ON(iocb->private == NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
if (overwrite) {
- atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3167,13 +3097,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end) {
ret = -ENOMEM;
goto retake_lock;
}
io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
/*
* we save the io structure for current async direct
* IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3130,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
dio_flags);
- if (iocb->private)
- ext4_inode_aio_set(inode, NULL);
/*
- * The io_end structure takes a reference to the inode, that
- * structure needs to be destroyed and the reference to the
- * inode need to be dropped, when IO is complete, even with 0
- * byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will
- * be destroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since VFS
- * direct IO won't invoke the end_io call back function, we
- * need to free the end_io structure here.
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->iocb);
+ /*
+ * Generic code already did inode_dio_done() so we
+ * have to clear EXT4_IO_END_DIRECT to not do it for
+ * the second time.
+ */
+ io_end->flag = 0;
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
+ }
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
* completed, we could do the conversion right here
*/
- err = ext4_convert_unwritten_extents(inode,
+ err = ext4_convert_unwritten_extents(NULL, inode,
offset, ret);
if (err < 0)
ret = err;
@@ -3231,9 +3173,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
}
retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
- inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -3292,6 +3235,7 @@ static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.bmap = ext4_bmap,
@@ -3307,6 +3251,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3267,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3355,89 +3300,56 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-
/*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
struct inode *inode = mapping->host;
- struct page *page;
- int err = 0;
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
-
- err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
- from, length, flags);
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- unlock_page(page);
- page_cache_release(page);
- return err;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped. Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been locked. The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'. This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode: The files inode
- * page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the beginning of the file)
- * to begin discarding
- * len: The length of bytes to discard
- * flags: Optional flags that may be used:
- *
- * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- * Only zero the regions of the page whose buffer heads
- * have already been unmapped. This flag is appropriate
- * for updating the contents of a page whose blocks may
- * have already been released, and we only want to zero
- * out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned int offset = from & (PAGE_CACHE_SIZE-1);
- unsigned int blocksize, max, pos;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
+ struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
- blocksize = inode->i_sb->s_blocksize;
- max = PAGE_CACHE_SIZE - offset;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
- if (index != page->index)
- return -EINVAL;
+ blocksize = inode->i_sb->s_blocksize;
+ max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
- * 'from' and the end of the page
+ * 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
@@ -3455,106 +3367,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- pos = offset;
- while (pos < offset + length) {
- unsigned int end_of_block, range_to_discard;
-
- err = 0;
-
- /* The length of space left to zero and unmap */
- range_to_discard = offset + length - pos;
-
- /* The length of space until the end of the block */
- end_of_block = blocksize - (pos & (blocksize-1));
-
- /*
- * Do not unmap or zero past end of block
- * for this buffer head
- */
- if (range_to_discard > end_of_block)
- range_to_discard = end_of_block;
-
-
- /*
- * Skip this buffer head if we are only zeroing unampped
- * regions of the page
- */
- if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
- buffer_mapped(bh))
- goto next;
-
- /* If the range is block aligned, unmap */
- if (range_to_discard == blocksize) {
- clear_buffer_dirty(bh);
- bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
- clear_buffer_uptodate(bh);
- zero_user(page, pos, range_to_discard);
- BUFFER_TRACE(bh, "Buffer discarded");
- goto next;
- }
-
- /*
- * If this block is not completely contained in the range
- * to be discarded, then it is not going to be released. Because
- * we need to keep this block, we need to make sure this part
- * of the page is uptodate before we modify it by writeing
- * partial zeros on it.
- */
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto unlock;
+ }
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh)) {
- /*
- * Buffer head must be mapped before we can read
- * from the block
- */
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto next;
- }
+ BUFFER_TRACE(bh, "still unmapped");
+ goto unlock;
}
+ }
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt.*/
- if (!buffer_uptodate(bh))
- goto next;
- }
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto unlock;
+ }
+ zero_user(page, offset, length);
+ BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto next;
- }
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ } else {
+ err = 0;
+ mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
+ }
- zero_user(page, pos, range_to_discard);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
- bh = bh->b_this_page;
- iblock++;
- pos += range_to_discard;
- }
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
+
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
return err;
}
@@ -3580,14 +3477,12 @@ int ext4_can_truncate(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
+ loff_t first_block_offset, last_block_offset;
handle_t *handle;
unsigned int credits;
int ret = 0;
@@ -3638,23 +3533,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
offset;
}
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
- ret = ext4_flush_unwritten_io(inode);
- if (ret)
- goto out_dio;
inode_dio_wait(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3556,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out_dio;
}
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the
- * buffer heads for the block aligned regions of the page that
- * were completely zeroed.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained
- * within a page just zero out and unmap the middle of
- * that page
- */
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
-
- if (ret)
- goto out_stop;
- } else {
- /*
- * zero out and unmap the partial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
-
- /*
- * zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
- }
-
- /*
- * If i_size is contained in the last page, we need to
- * unmap and zero the partial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (ret)
- goto out_stop;
- }
- }
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
@@ -3803,7 +3635,6 @@ void ext4_truncate(struct inode *inode)
unsigned int credits;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- loff_t page_len;
/*
* There is a possibility that we're either freeing the inode
@@ -3830,12 +3661,6 @@ void ext4_truncate(struct inode *inode)
return;
}
- /*
- * finish any pending end_io work so we won't run the risk of
- * converting any truncated blocks to initialized later
- */
- ext4_flush_unwritten_io(inode);
-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
@@ -3847,14 +3672,8 @@ void ext4_truncate(struct inode *inode)
return;
}
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0))
- goto out_stop;
- }
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
* We add the inode to the orphan list, so that if this
@@ -4623,7 +4442,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
inode->i_size >> PAGE_CACHE_SHIFT);
if (!page)
return;
- ret = __ext4_journalled_invalidatepage(page, offset);
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
unlock_page(page);
page_cache_release(page);
if (ret != -EBUSY)
@@ -4805,7 +4625,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
@@ -4823,15 +4643,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
EXT4_I(inode)->i_reserved_data_blocks);
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
return 0;
}
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_ind_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -4845,7 +4666,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -4853,14 +4675,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -4868,12 +4686,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -4904,7 +4717,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..3c7304c6370c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -622,6 +622,8 @@ resizefs_out:
return 0;
}
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
default:
return -ENOTTY;
@@ -686,6 +688,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT:
case FITRIM:
case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
+ cond_resched();
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp) {
- ext4_discard_allocated_blocks(ac);
- goto errout;
- }
+ if (*errp)
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
@@ -4735,11 +4740,16 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
+ retry:
new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
- ext4_mb_unload_buddy(&e4b);
- err = -ENOMEM;
- goto error_return;
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
}
new_entry->efd_start_cluster = bit;
new_entry->efd_group = block_group;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 49e8bdff9163..f99bdb8548b2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+ path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..7fa4d855dbd5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
int ret = 0;
struct ext4_ext_path *path;
- path = ext4_ext_find_extent(inode, lblock, *orig_path);
+ path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
ret = PTR_ERR(path);
else if (path[ext_depth(inode)].p_ext == NULL)
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
- long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f91002f8c017..1bec5a5c1e45 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse(bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
@@ -2319,11 +2316,11 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
if (handle)
@@ -3008,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
+ *
+ * n.b. old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- handle_t *handle;
+ handle_t *handle = NULL;
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval, force_da_alloc = 0;
+ int retval;
int inlined = 0, new_inlined = 0;
struct ext4_dir_entry_2 *parent_de;
@@ -3029,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* in separate transaction */
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
/*
@@ -3059,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new_bh = NULL;
}
}
+ if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old_inode);
+
+ handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ ext4_handle_sync(handle);
+
if (S_ISDIR(old_inode->i_mode)) {
if (new_inode) {
retval = -ENOTEMPTY;
@@ -3189,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext4_orphan_add(handle, new_inode);
- if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- force_da_alloc = 1;
}
retval = 0;
@@ -3198,9 +3201,8 @@ end_rename:
brelse(dir_bh);
brelse(old_bh);
brelse(new_bh);
- ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -46,46 +47,121 @@ void ext4_exit_pageio(void)
}
/*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
*/
-void ext4_ioend_shutdown(struct inode *inode)
+static void buffer_io_error(struct buffer_head *bh)
+{
+ char b[BDEVNAME_SIZE];
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
- /*
- * We need to make sure the work structure is finished being
- * used before we let the inode get destroyed.
- */
- if (work_pending(&EXT4_I(inode)->i_unwritten_work))
- cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ struct bio_vec *bvec = &bio->bi_io_vec[i];
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
+ }
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
+
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+ if (io_end->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(io_end->inode);
+ if (io_end->iocb)
+ aio_complete(io_end->iocb, io_end->result, 0);
+ kmem_cache_free(io_end_cachep, io_end);
}
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- BUG_ON(!io);
- BUG_ON(!list_empty(&io->list));
- BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+ struct inode *inode = io_end->inode;
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io->inode));
- kmem_cache_free(io_end_cachep, io);
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
}
-/* check a range of space and convert unwritten extents to written. */
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- ret = ext4_convert_unwritten_extents(inode, offset, size);
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
@@ -93,30 +169,22 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
return ret;
}
-static void dump_completed_IO(struct inode *inode)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
- ext4_debug("inode %lu completed_io list is empty\n",
- inode->i_ino);
+ if (list_empty(head))
return;
- }
- ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +198,30 @@ static void dump_completed_IO(struct inode *inode)
}
/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
unsigned long flags;
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list))
- queue_work(wq, &ei->i_unwritten_work);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ if (io_end->handle) {
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ } else {
+ wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+ if (list_empty(&ei->i_unrsv_conversion_list))
+ queue_work(wq, &ei->i_unrsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+ }
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode)
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
{
ext4_io_end_t *io;
struct list_head unwritten;
@@ -155,8 +230,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
int err, ret = 0;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- dump_completed_IO(inode);
- list_replace_init(&ei->i_completed_io_list, &unwritten);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
@@ -167,30 +242,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
- ext4_free_io_end(io);
}
return ret;
}
/*
- * work on completed aio dio IO, to convert unwritten extents to extents
+ * work on completed IO, to convert unwritten extents to extents
*/
-void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_rsv_work(struct work_struct *work)
{
struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unwritten_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode);
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-int ext4_flush_unwritten_io(struct inode *inode)
+void ext4_end_io_unrsv_work(struct work_struct *work)
{
- int ret;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
- !(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode);
- ext4_unwritten_wait(inode);
- return ret;
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unrsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +270,59 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
}
return io;
}
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c. This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message. We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
}
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
+}
+
+/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct inode *inode;
- int i;
- int blocksize;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
- inode = io_end->inode;
- blocksize = 1 << inode->i_blkbits;
- bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct bio_vec *bvec = &bio->bi_io_vec[i];
- struct page *page = bvec->bv_page;
- struct buffer_head *bh, *head;
- unsigned bio_start = bvec->bv_offset;
- unsigned bio_end = bio_start + bvec->bv_len;
- unsigned under_io = 0;
- unsigned long flags;
-
- if (!page)
- continue;
-
- if (error) {
- SetPageError(page);
- set_bit(AS_EIO, &page->mapping->flags);
- }
- bh = head = page_buffers(page);
- /*
- * We check all buffers in the page under BH_Uptodate_Lock
- * to avoid races with other end io clearing async_write flags
- */
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
- do {
- if (bh_offset(bh) < bio_start ||
- bh_offset(bh) + blocksize > bio_end) {
- if (buffer_async_write(bh))
- under_io++;
- continue;
- }
- clear_buffer_async_write(bh);
- if (error)
- buffer_io_error(bh);
- } while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
- if (!under_io)
- end_page_writeback(page);
- }
- bio_put(bio);
if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
+ struct inode *inode = io_end->inode;
+
ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
inode->i_ino,
@@ -286,12 +332,23 @@ static void ext4_end_bio(struct bio *bio, int error)
bi_sector >> (inode->i_blkbits - 9));
}
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- return;
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
}
-
- ext4_add_complete_io(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +362,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio);
}
io->io_bio = NULL;
- io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
io->io_end = NULL;
}
-static int io_submit_init(struct ext4_io_submit *io,
- struct inode *inode,
- struct writeback_control *wbc,
- struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
+ return -ENOMEM;
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
-
- io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+ bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
- struct writeback_control *wbc,
struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
int ret;
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +401,14 @@ submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init(io, inode, wbc, bh);
+ ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
- io_end = io->io_end;
- if (test_clear_buffer_uninit(bh))
- ext4_set_io_unwritten_flag(inode, io_end);
- io->io_end->size += bh->b_size;
- io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
+ io->io_next_block++;
return 0;
}
@@ -432,7 +480,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, wbc, bh);
+ ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_group_overhead_blocks(sb, group);
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, "Cannot add at group %u (only %u groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
+ if (offset != 0)
ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
struct inode *inode = NULL;
- int gdb_off, gdb_num;
+ int gdb_off;
int err;
__u16 bg_flags = 0;
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
err = err2;
if (!err) {
- ext4_fsblk_t first_block;
- first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..36b141e420b7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
}
if (test_opt(sb, ERRORS_RO)) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
ext4_handle_error(sb);
}
-void ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
ext4_handle_error(inode->i_sb);
}
-void ext4_error_file(struct file *file, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
if ((sb->s_flags & MS_RDONLY) == 0) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
panic("EXT4-fs panic from previous error\n");
}
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->dio_unwritten_wq);
- destroy_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->unrsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_quota = 0;
#endif
ei->jinode = NULL;
- INIT_LIST_HEAD(&ei->i_completed_io_list);
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
- INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
+ .sync_fs = ext4_sync_fs_nojournal,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1684,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
#endif
}
@@ -1908,7 +1920,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group;
- unsigned int groups_per_flex = 0;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1927,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
sbi->s_log_groups_per_flex = 0;
return 1;
}
- groups_per_flex = 1U << sbi->s_log_groups_per_flex;
err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
if (err)
@@ -2164,19 +2174,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
dquot_initialize(inode);
if (inode->i_nlink) {
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -2377,7 +2390,10 @@ struct ext4_attr {
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
- int offset;
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
};
static int parse_strtoull(const char *buf,
@@ -2446,7 +2462,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
static ssize_t sbi_ui_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -2455,7 +2471,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
unsigned long t;
int ret;
@@ -2504,12 +2520,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
return count;
}
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct ext4_sb_info, _elname), \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
}
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2544,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2566,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
@@ -3586,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
- /* Do we have standard group size of blocksize * 8 blocks ? */
- if (sbi->s_blocks_per_group == blocksize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
-
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3659,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
/*
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
@@ -3763,7 +3795,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
- ext4_es_register_shrinker(sb);
+ ext4_es_register_shrinker(sbi);
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
@@ -3787,7 +3819,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
- sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -3915,12 +3946,20 @@ no_journal:
* The maximum number of concurrent works can be high and
* concurrency isn't really necessary. Limit it to 1.
*/
- EXT4_SB(sb)->dio_unwritten_wq =
- alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->dio_unwritten_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
ret = -ENOMEM;
- goto failed_mount_wq;
+ goto failed_mount4;
+ }
+
+ EXT4_SB(sb)->unrsv_conversion_wq =
+ alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
}
/*
@@ -4074,14 +4113,17 @@ failed_mount4a:
sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ if (EXT4_SB(sb)->unrsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4559,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
*/
dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(sbi->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
+ }
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
}
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
return ret;
}
@@ -5406,6 +5481,7 @@ static void __exit ext4_exit_fs(void)
kset_unregister(ext4_kset);
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_es();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+ bool "F2FS Security Labels"
+ depends on F2FS_FS_XATTR
+ help
+ Security labels provide an access control facility to support Linux
+ Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the f2fs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size);
+ error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
kfree(value);
if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..c5a5c390a5cc 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = {
.set_page_dirty = f2fs_set_meta_page_dirty,
};
-int check_orphan_space(struct f2fs_sb_info *sbi)
+int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
unsigned int max_orphans;
int err = 0;
@@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi)
mutex_lock(&sbi->orphan_inode_mutex);
if (sbi->n_orphans >= max_orphans)
err = -ENOSPC;
+ else
+ sbi->n_orphans++;
mutex_unlock(&sbi->orphan_inode_mutex);
return err;
}
+void release_orphan_inode(struct f2fs_sb_info *sbi)
+{
+ mutex_lock(&sbi->orphan_inode_mutex);
+ sbi->n_orphans--;
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct list_head *head, *this;
@@ -229,21 +238,18 @@ retry:
list_add(&new->list, this->prev);
else
list_add_tail(&new->list, head);
-
- sbi->n_orphans++;
out:
mutex_unlock(&sbi->orphan_inode_mutex);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct list_head *this, *next, *head;
+ struct list_head *head;
struct orphan_inode_entry *orphan;
mutex_lock(&sbi->orphan_inode_mutex);
head = &sbi->orphan_inode_list;
- list_for_each_safe(this, next, head) {
- orphan = list_entry(this, struct orphan_inode_entry, list);
+ list_for_each_entry(orphan, head, list) {
if (orphan->ino == ino) {
list_del(&orphan->list);
kmem_cache_free(orphan_entry_slab, orphan);
@@ -357,8 +363,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
unsigned long blk_size = sbi->blocksize;
struct f2fs_checkpoint *cp_block;
unsigned long long cur_version = 0, pre_version = 0;
- unsigned int crc = 0;
size_t crc_offset;
+ __u32 crc = 0;
/* Read the 1st cp block in this CP pack */
cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +375,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp1;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
@@ -384,7 +390,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp2;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
@@ -450,13 +456,30 @@ fail_no_cp:
return -EINVAL;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct list_head *head = &sbi->dir_inode_list;
- struct dir_inode_entry *new;
struct list_head *this;
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode)
+ return -EEXIST;
+ }
+ list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+ sbi->n_dirty_dirs++;
+#endif
+ return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+
if (!S_ISDIR(inode->i_mode))
return;
retry:
@@ -469,23 +492,31 @@ retry:
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode) {
- kmem_cache_free(inode_entry_slab, new);
- goto out;
- }
- }
- list_add_tail(&new->list, head);
- sbi->n_dirty_dirs++;
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
- BUG_ON(!S_ISDIR(inode->i_mode));
-out:
inc_page_count(sbi, F2FS_DIRTY_DENTS);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+void add_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+ spin_lock(&sbi->dir_inode_lock);
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
spin_unlock(&sbi->dir_inode_lock);
}
@@ -499,8 +530,10 @@ void remove_dirty_dir_inode(struct inode *inode)
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents))
- goto out;
+ if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
list_for_each(this, head) {
struct dir_inode_entry *entry;
@@ -508,12 +541,38 @@ void remove_dirty_dir_inode(struct inode *inode)
if (entry->inode == inode) {
list_del(&entry->list);
kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->n_dirty_dirs--;
+#endif
+ break;
+ }
+ }
+ spin_unlock(&sbi->dir_inode_lock);
+
+ /* Only from the recovery routine */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+ clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ iput(inode);
+ }
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *this;
+ struct inode *inode = NULL;
+
+ spin_lock(&sbi->dir_inode_lock);
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode->i_ino == ino) {
+ inode = entry->inode;
break;
}
}
-out:
spin_unlock(&sbi->dir_inode_lock);
+ return inode;
}
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +654,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
- unsigned int crc32 = 0;
+ __u32 crc32 = 0;
void *kaddr;
int i;
@@ -664,8 +723,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
- *(__le32 *)((unsigned char *)ckpt +
- le32_to_cpu(ckpt->checksum_offset))
+ *((__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
start_blk = __start_cp_addr(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..a7eb52925723 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, false);
- rn = (struct f2fs_node *)page_address(node_page);
+ rn = F2FS_NODE(node_page);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
return 0;
}
+#ifdef CONFIG_F2FS_STAT_FS
sbi->total_hit_ext++;
+#endif
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
else
bh_result->b_size = UINT_MAX;
+#ifdef CONFIG_F2FS_STAT_FS
sbi->read_hit_ext++;
+#endif
read_unlock(&fi->ext.ext_lock);
return 1;
}
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
if (dn.data_blkaddr == NEW_ADDR)
return ERR_PTR(-EINVAL);
- page = grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
struct page *page;
int err;
+repeat:
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
+ }
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR)
+ if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ }
if (PageUptodate(page))
return page;
@@ -274,9 +285,10 @@ repeat:
*
* Also, caller should grab and release a mutex by calling mutex_lock_op() and
* mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
*/
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
- bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+ struct page *npage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ set_new_dnode(&dn, inode, npage, npage, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err)
return ERR_PTR(err);
if (dn.data_blkaddr == NULL_ADDR) {
if (reserve_new_block(&dn)) {
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
return ERR_PTR(-ENOSPC);
}
}
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
repeat:
page = grab_cache_page(mapping, index);
if (!page)
@@ -325,6 +339,8 @@ repeat:
if (new_i_size &&
i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ /* Only the directory inode sets new_i_size */
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
mark_inode_dirty_sync(inode);
}
return page;
@@ -349,7 +365,6 @@ static void read_end_io(struct bio *bio, int err)
}
unlock_page(page);
} while (bvec >= bio->bi_io_vec);
- kfree(bio->bi_private);
bio_put(bio);
}
@@ -375,7 +390,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
bio->bi_end_io = read_end_io;
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
- kfree(bio->bi_private);
bio_put(bio);
up_read(&sbi->bio_sem);
f2fs_put_page(page, 1);
@@ -481,8 +495,9 @@ int do_write_data_page(struct page *page)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
- need_inplace_update(inode)) {
+ if (unlikely(old_blk_addr != NEW_ADDR &&
+ !is_cold_data(page) &&
+ need_inplace_update(inode))) {
rewrite_data_page(F2FS_SB(inode->i_sb), page,
old_blk_addr);
} else {
@@ -619,9 +634,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
int err = 0;
int ilock;
- /* for nobh_write_end */
- *fsdata = NULL;
-
f2fs_balance_fs(sbi);
repeat:
page = grab_cache_page_write_begin(mapping, index, flags);
@@ -684,6 +696,27 @@ err:
return err;
}
+static int f2fs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ if (pos + copied > i_size_read(inode)) {
+ i_size_write(inode, pos + copied);
+ mark_inode_dirty(inode);
+ update_inode_page(inode);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
+}
+
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
@@ -698,7 +731,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
get_data_block_ro);
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +774,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
.invalidatepage = f2fs_invalidate_data_page,
.releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..a84b0a8e6854 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -29,7 +29,7 @@ static DEFINE_MUTEX(f2fs_stat_mutex);
static void update_general_status(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
int i;
/* valid check of the segment numbers */
@@ -83,7 +83,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
*/
static void update_sit_info(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno, vblocks;
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
*/
static void update_mem_info(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned npages;
if (si->base_mem)
@@ -175,12 +175,12 @@ get_cache:
static int stat_show(struct seq_file *s, void *v)
{
- struct f2fs_stat_info *si, *next;
+ struct f2fs_stat_info *si;
int i = 0;
int j;
mutex_lock(&f2fs_stat_mutex);
- list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ list_for_each_entry(si, &f2fs_stat_list, stat_list) {
char devname[BDEVNAME_SIZE];
update_general_status(si->sbi);
@@ -253,21 +253,21 @@ static int stat_show(struct seq_file *s, void *v)
si->nats, NM_WOUT_THRESHOLD);
seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
si->sits, si->fnids);
- seq_printf(s, "\nDistribution of User Blocks:");
- seq_printf(s, " [ valid | invalid | free ]\n");
- seq_printf(s, " [");
+ seq_puts(s, "\nDistribution of User Blocks:");
+ seq_puts(s, " [ valid | invalid | free ]\n");
+ seq_puts(s, " [");
for (j = 0; j < si->util_valid; j++)
- seq_printf(s, "-");
- seq_printf(s, "|");
+ seq_putc(s, '-');
+ seq_putc(s, '|');
for (j = 0; j < si->util_invalid; j++)
- seq_printf(s, "-");
- seq_printf(s, "|");
+ seq_putc(s, '-');
+ seq_putc(s, '|');
for (j = 0; j < si->util_free; j++)
- seq_printf(s, "-");
- seq_printf(s, "]\n\n");
+ seq_putc(s, '-');
+ seq_puts(s, "]\n\n");
seq_printf(s, "SSR: %u blocks in %u segments\n",
si->block_count[SSR], si->segment_count[SSR]);
seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -305,11 +305,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
- sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
- if (!sbi->stat_info)
+ si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+ if (!si)
return -ENOMEM;
- si = sbi->stat_info;
si->all_area_segs = le32_to_cpu(raw_super->segment_count);
si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
@@ -319,6 +318,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
si->sbi = sbi;
+ sbi->stat_info = si;
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
@@ -329,13 +329,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
mutex_lock(&f2fs_stat_mutex);
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
- kfree(sbi->stat_info);
+ kfree(si);
}
void __init f2fs_create_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 600bb5efe603..384c6daf9a89 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
#include "f2fs.h"
#include "node.h"
#include "acl.h"
+#include "xattr.h"
static unsigned long dir_blocks(struct inode *inode)
{
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page = NULL;
- struct f2fs_dir_entry *de = NULL;
- struct f2fs_dentry_block *dentry_blk = NULL;
+ struct page *page;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_block *dentry_blk;
page = get_lock_data_page(dir, 0);
if (IS_ERR(page))
@@ -264,30 +265,41 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_put_page(page, 1);
}
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_node *rn;
- if (IS_ERR(ipage))
- return;
-
- wait_on_page_writeback(ipage);
-
/* copy name info. to this inode page */
- rn = (struct f2fs_node *)page_address(ipage);
+ rn = F2FS_NODE(ipage);
rn->i.i_namelen = cpu_to_le32(name->len);
memcpy(rn->i.i_name, name->name, name->len);
set_page_dirty(ipage);
}
-static int make_empty_dir(struct inode *inode, struct inode *parent)
+int update_dent_inode(struct inode *inode, const struct qstr *name)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ init_dent_inode(name, page);
+ f2fs_put_page(page, 1);
+
+ return 0;
+}
+
+static int make_empty_dir(struct inode *inode,
+ struct inode *parent, struct page *page)
{
struct page *dentry_page;
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dir_entry *de;
void *kaddr;
- dentry_page = get_new_data_page(inode, 0, true);
+ dentry_page = get_new_data_page(inode, page, 0, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -317,63 +329,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
return 0;
}
-static int init_inode_metadata(struct inode *inode,
+static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
+ struct page *page;
+ int err;
+
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- int err;
- err = new_inode_page(inode, name);
- if (err)
- return err;
+ page = new_inode_page(inode, name);
+ if (IS_ERR(page))
+ return page;
if (S_ISDIR(inode->i_mode)) {
- err = make_empty_dir(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ err = make_empty_dir(inode, dir, page);
+ if (err)
+ goto error;
}
err = f2fs_init_acl(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ if (err)
+ goto error;
+
+ err = f2fs_init_security(inode, dir, name, page);
+ if (err)
+ goto error;
+
+ wait_on_page_writeback(page);
} else {
- struct page *ipage;
- ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- set_cold_node(inode, ipage);
- init_dent_inode(name, ipage);
- f2fs_put_page(ipage, 1);
+ page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(page))
+ return page;
+
+ wait_on_page_writeback(page);
+ set_cold_node(inode, page);
}
+
+ init_dent_inode(name, page);
+
+ /*
+ * This file should be checkpointed during fsync.
+ * We lost i_pino from now on.
+ */
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ file_lost_pino(inode);
inc_nlink(inode);
- update_inode_page(inode);
}
- return 0;
+ return page;
+
+error:
+ f2fs_put_page(page, 1);
+ remove_inode_page(inode);
+ return ERR_PTR(err);
}
static void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- bool need_dir_update = false;
-
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
if (S_ISDIR(inode->i_mode)) {
inc_nlink(dir);
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (F2FS_I(dir)->i_current_depth != current_depth) {
F2FS_I(dir)->i_current_depth = current_depth;
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (need_dir_update)
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
update_inode_page(dir);
else
mark_inode_dirty(dir);
@@ -423,6 +448,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
int slots = GET_DENTRY_SLOTS(namelen);
+ struct page *page;
int err = 0;
int i;
@@ -448,7 +474,7 @@ start:
bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- dentry_page = get_new_data_page(dir, block, true);
+ dentry_page = get_new_data_page(dir, NULL, block, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -465,12 +491,13 @@ start:
++level;
goto start;
add_dentry:
- err = init_inode_metadata(inode, dir, name);
- if (err)
- goto fail;
-
wait_on_page_writeback(dentry_page);
+ page = init_inode_metadata(inode, dir, name);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
@@ -481,11 +508,14 @@ add_dentry:
test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
set_page_dirty(dentry_page);
- update_parent_metadata(dir, inode, current_depth);
-
- /* update parent inode number before releasing dentry page */
+ /* we don't need to mark_inode_dirty now */
F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+
+ update_parent_metadata(dir, inode, current_depth);
fail:
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
@@ -542,6 +572,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (inode->i_nlink == 0)
add_orphan_inode(sbi, inode->i_ino);
+ else
+ release_orphan_inode(sbi);
}
if (bit_pos == NR_DENTRY_IN_BLOCK) {
@@ -595,13 +627,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned int bit_pos = 0, start_bit_pos = 0;
+ unsigned int bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
unsigned char d_type = DT_UNKNOWN;
- int slots;
bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
@@ -610,7 +641,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(dentry_page))
continue;
- start_bit_pos = bit_pos;
dentry_blk = kmap(dentry_page);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -619,19 +649,19 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (bit_pos >= NR_DENTRY_IN_BLOCK)
break;
- ctx->pos += bit_pos - start_bit_pos;
de = &dentry_blk->dentry[bit_pos];
if (de->file_type < F2FS_FT_MAX)
d_type = f2fs_filetype_table[de->file_type];
else
d_type = DT_UNKNOWN;
if (!dir_emit(ctx,
- dentry_blk->filename[bit_pos],
- le16_to_cpu(de->name_len),
- le32_to_cpu(de->ino), d_type))
- goto success;
- slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- bit_pos += slots;
+ dentry_blk->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ le32_to_cpu(de->ino), d_type))
+ goto stop;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
}
bit_pos = 0;
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -639,7 +669,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
f2fs_put_page(dentry_page, 1);
dentry_page = NULL;
}
-success:
+stop:
if (dentry_page && !IS_ERR(dentry_page)) {
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..d8e386ceb0cd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/crc32.h>
#include <linux/magic.h>
+#include <linux/kobject.h>
/*
* For mount options
@@ -37,21 +38,35 @@
typecheck(unsigned long long, b) && \
((long long)((a) - (b)) > 0))
-typedef u64 block_t;
+typedef u32 block_t; /*
+ * should not change u32, since it is the on-disk block
+ * address format, __le32.
+ */
typedef u32 nid_t;
struct f2fs_mount_info {
unsigned int opt;
};
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
{
- return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+ unsigned char *p = (unsigned char *)buf;
+ __u32 crc = F2FS_SUPER_MAGIC;
+ int i;
+
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+ }
+ return crc;
}
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
{
- return f2fs_crc32(buff, buff_size) == blk_crc;
+ return f2fs_crc32(buf, buf_size) == blk_crc;
}
/*
@@ -148,7 +163,7 @@ struct extent_info {
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
#define FADVISE_COLD_BIT 0x01
-#define FADVISE_CP_BIT 0x02
+#define FADVISE_LOST_PINO_BIT 0x02
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
@@ -336,6 +351,7 @@ enum page_type {
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
+ struct proc_dir_entry *s_proc; /* proc entry */
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int s_dirty; /* dirty flag for checkpoint */
@@ -369,7 +385,6 @@ struct f2fs_sb_info {
/* for directory inode management */
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- unsigned int n_dirty_dirs; /* # of dir inodes */
/* basic file system units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,13 +421,20 @@ struct f2fs_sb_info {
* for stat information.
* one is for the LFS mode, and the other is for the SSR mode.
*/
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
- unsigned int last_victim[2]; /* last victim segment # */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
int bg_gc; /* background gc calls */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+#endif
+ unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
+
+ /* For sysfs suppport */
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
};
/*
@@ -438,6 +460,11 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
return (struct f2fs_checkpoint *)(sbi->ckpt);
}
+static inline struct f2fs_node *F2FS_NODE(struct page *page)
+{
+ return (struct f2fs_node *)page_address(page);
+}
+
static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
{
return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -495,9 +522,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
{
- int i = 0;
- for (; i < NR_GLOBAL_LOCKS; i++)
- mutex_lock(&sbi->fs_lock[i]);
+ int i;
+
+ for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+ /*
+ * This is the only time we take multiple fs_lock[]
+ * instances; the order is immaterial since we
+ * always hold cp_mutex, which serializes multiple
+ * such operations.
+ */
+ mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+ }
}
static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
@@ -788,7 +823,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
static inline bool IS_INODE(struct page *page)
{
- struct f2fs_node *p = (struct f2fs_node *)page_address(page);
+ struct f2fs_node *p = F2FS_NODE(page);
return RAW_IS_INODE(p);
}
@@ -802,7 +837,7 @@ static inline block_t datablock_addr(struct page *node_page,
{
struct f2fs_node *raw_node;
__le32 *addr_array;
- raw_node = (struct f2fs_node *)page_address(node_page);
+ raw_node = F2FS_NODE(node_page);
addr_array = blkaddr_in_node(raw_node);
return le32_to_cpu(addr_array[offset]);
}
@@ -843,9 +878,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_UPDATE_DIR, /* should update inode block for consistency */
+ FI_DELAY_IPUT, /* used for the recovery */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +916,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
return 0;
}
+static inline int f2fs_readonly(struct super_block *sb)
+{
+ return sb->s_flags & MS_RDONLY;
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
long f2fs_ioctl(struct file *, unsigned int, unsigned long);
long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -913,7 +958,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
+int update_dent_inode(struct inode *, const struct qstr *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +993,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
@@ -974,13 +1019,13 @@ void destroy_node_manager_caches(void);
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
void clear_prefree_segments(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
struct bio *f2fs_bio_alloc(struct block_device *, int);
-void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
+void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
block_t, block_t *);
@@ -1005,13 +1050,16 @@ void destroy_segment_manager(struct f2fs_sb_info *);
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-int check_orphan_space(struct f2fs_sb_info *);
+int acquire_orphan_inode(struct f2fs_sb_info *);
+void release_orphan_inode(struct f2fs_sb_info *);
void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1073,7 @@ int reserve_new_block(struct dnode_of_data *);
void update_extent_cache(block_t, struct dnode_of_data *);
struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
int do_write_data_page(struct page *);
@@ -1078,11 +1126,16 @@ struct f2fs_stat_info {
unsigned base_mem, cache_mem;
};
+static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_stat_info*)sbi->stat_info;
+}
+
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_seg_count(sbi, type) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
(si)->tot_segs++; \
if (type == SUM_TYPE_DATA) \
si->data_segs++; \
@@ -1095,14 +1148,14 @@ struct f2fs_stat_info {
#define stat_inc_data_blk_count(sbi, blks) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
} while (0)
#define stat_inc_node_blk_count(sbi, blks) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
} while (0)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..c2deb27ffb72 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
+ file_update_time(vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) ||
+ page_offset(page) > i_size_read(inode) ||
!PageUptodate(page)) {
unlock_page(page);
err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
* check to see if the page is mapped already (no holes)
*/
if (PageMappedToDisk(page))
- goto out;
-
- /* fill the page */
- wait_on_page_writeback(page);
+ goto mapped;
/* page is wholly or partially inside EOF */
if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
set_page_dirty(page);
SetPageUptodate(page);
- file_update_time(vma->vm_file);
+mapped:
+ /* fill the page */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -102,6 +102,26 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.remap_pages = generic_file_remap_pages,
};
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+ struct dentry *dentry;
+
+ inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ iput(inode);
+ if (!dentry)
+ return 0;
+
+ if (update_dent_inode(inode, &dentry->d_name)) {
+ dput(dentry);
+ return 0;
+ }
+
+ *pino = parent_ino(dentry);
+ dput(dentry);
+ return 1;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -114,7 +134,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
.for_reclaim = 0,
};
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(inode->i_sb))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -129,12 +149,13 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_lock(&inode->i_mutex);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
-
+ /*
+ * Both of fdatasync() and fsync() are able to be recovered from
+ * sudden-power-off.
+ */
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- else if (is_cp_file(inode))
+ else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
need_cp = true;
@@ -142,11 +163,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
need_cp = true;
if (need_cp) {
+ nid_t pino;
+
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ F2FS_I(inode)->i_pino = pino;
+ file_got_pino(inode);
+ mark_inode_dirty_sync(inode);
+ ret = f2fs_write_inode(inode, NULL);
+ if (ret)
+ goto out;
+ }
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
@@ -168,14 +201,14 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct f2fs_node *raw_node;
__le32 *addr;
- raw_node = page_address(dn->node_page);
+ raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + ofs;
for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
@@ -185,10 +218,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
update_extent_cache(NULL_ADDR, dn);
invalidate_blocks(sbi, blkaddr);
- dec_valid_block_count(sbi, dn->inode, 1);
nr_free++;
}
if (nr_free) {
+ dec_valid_block_count(sbi, dn->inode, nr_free);
set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
@@ -291,7 +324,7 @@ void f2fs_truncate(struct inode *inode)
}
}
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -387,7 +420,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
f2fs_balance_fs(sbi);
ilock = mutex_lock_op(sbi);
- page = get_new_data_page(inode, index, false);
+ page = get_new_data_page(inode, NULL, index, false);
mutex_unlock_op(sbi, ilock);
if (!IS_ERR(page)) {
@@ -575,10 +608,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case F2FS_IOC_GETFLAGS:
flags = fi->i_flags & FS_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
- case FS_IOC_SETFLAGS:
+ case F2FS_IOC_SETFLAGS:
{
unsigned int oldflags;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..e6b3ffd5ff6a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -29,10 +29,11 @@ static struct kmem_cache *winode_slab;
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
+ struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
long wait_ms;
- wait_ms = GC_THREAD_MIN_SLEEP_TIME;
+ wait_ms = gc_th->min_sleep_time;
do {
if (try_to_freeze())
@@ -45,7 +46,7 @@ static int gc_thread_func(void *data)
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
- wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
continue;
}
@@ -66,21 +67,23 @@ static int gc_thread_func(void *data)
continue;
if (!is_idle(sbi)) {
- wait_ms = increase_sleep_time(wait_ms);
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
if (has_enough_invalid_blocks(sbi))
- wait_ms = decrease_sleep_time(wait_ms);
+ wait_ms = decrease_sleep_time(gc_th, wait_ms);
else
- wait_ms = increase_sleep_time(wait_ms);
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->bg_gc++;
+#endif
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
- wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
+ wait_ms = gc_th->no_gc_sleep_time;
} while (!kthread_should_stop());
return 0;
}
@@ -89,23 +92,34 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
+ int err = 0;
if (!test_opt(sbi, BG_GC))
- return 0;
+ goto out;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th)
- return -ENOMEM;
+ if (!gc_th) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
+ gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
+ gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
+
+ gc_th->gc_idle = 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
+ err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
- return -ENOMEM;
}
- return 0;
+
+out:
+ return err;
}
void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -118,9 +132,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread = NULL;
}
-static int select_gc_type(int gc_type)
+static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
{
- return (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+ int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+
+ if (gc_th && gc_th->gc_idle) {
+ if (gc_th->gc_idle == 1)
+ gc_mode = GC_CB;
+ else if (gc_th->gc_idle == 2)
+ gc_mode = GC_GREEDY;
+ }
+ return gc_mode;
}
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
@@ -133,7 +155,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->dirty_segmap = dirty_i->dirty_segmap[type];
p->ofs_unit = 1;
} else {
- p->gc_mode = select_gc_type(gc_type);
+ p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
@@ -234,14 +256,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno;
+ unsigned int secno, max_cost;
int nsearched = 0;
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = get_max_cost(sbi, &p);
+ p.min_cost = max_cost = get_max_cost(sbi, &p);
mutex_lock(&dirty_i->seglist_lock);
@@ -280,7 +302,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = cost;
}
- if (cost == get_max_cost(sbi, &p))
+ if (cost == max_cost)
continue;
if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +310,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
break;
}
}
-got_it:
if (p.min_segno != NULL_SEGNO) {
+got_it:
if (p.alloc_mode == LFS) {
secno = GET_SECNO(sbi, p.min_segno);
if (gc_type == FG_GC)
@@ -314,28 +336,21 @@ static const struct victim_selection default_v_ops = {
static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
{
- struct list_head *this;
struct inode_entry *ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
+ list_for_each_entry(ie, ilist, list)
if (ie->inode->i_ino == ino)
return ie->inode;
- }
return NULL;
}
static void add_gc_inode(struct inode *inode, struct list_head *ilist)
{
- struct list_head *this;
- struct inode_entry *new_ie, *ie;
+ struct inode_entry *new_ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
- if (ie->inode == inode) {
- iput(inode);
- return;
- }
+ if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ iput(inode);
+ return;
}
repeat:
new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
@@ -407,8 +422,7 @@ next_step:
/* set page dirty and write it */
if (gc_type == FG_GC) {
- f2fs_submit_bio(sbi, NODE, true);
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
@@ -508,10 +522,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
} else {
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- if (PageWriteback(page)) {
- f2fs_submit_bio(sbi, DATA, true);
- wait_on_page_writeback(page);
- }
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page) &&
S_ISDIR(inode->i_mode)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 2c6a6bd08322..c22dee9f1422 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -13,9 +13,9 @@
* whether IO subsystem is idle
* or not
*/
-#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
-#define GC_THREAD_MAX_SLEEP_TIME 60000
-#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
+#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
+#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
+#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
@@ -25,6 +25,14 @@
struct f2fs_gc_kthread {
struct task_struct *f2fs_gc_task;
wait_queue_head_t gc_wait_queue_head;
+
+ /* for gc sleep time */
+ unsigned int min_sleep_time;
+ unsigned int max_sleep_time;
+ unsigned int no_gc_sleep_time;
+
+ /* for changing gc mode */
+ unsigned int gc_idle;
};
struct inode_entry {
@@ -56,25 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
-static inline long increase_sleep_time(long wait)
+static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
{
- if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+ if (wait == gc_th->no_gc_sleep_time)
return wait;
- wait += GC_THREAD_MIN_SLEEP_TIME;
- if (wait > GC_THREAD_MAX_SLEEP_TIME)
- wait = GC_THREAD_MAX_SLEEP_TIME;
+ wait += gc_th->min_sleep_time;
+ if (wait > gc_th->max_sleep_time)
+ wait = gc_th->max_sleep_time;
return wait;
}
-static inline long decrease_sleep_time(long wait)
+static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
{
- if (wait == GC_THREAD_NOGC_SLEEP_TIME)
- wait = GC_THREAD_MAX_SLEEP_TIME;
+ if (wait == gc_th->no_gc_sleep_time)
+ wait = gc_th->max_sleep_time;
- wait -= GC_THREAD_MIN_SLEEP_TIME;
- if (wait <= GC_THREAD_MIN_SLEEP_TIME)
- wait = GC_THREAD_MIN_SLEEP_TIME;
+ wait -= gc_th->min_sleep_time;
+ if (wait <= gc_th->min_sleep_time)
+ wait = gc_th->min_sleep_time;
return wait;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..9ab81e7472c5 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -56,7 +56,7 @@ static int do_read_inode(struct inode *inode)
if (IS_ERR(node_page))
return PTR_ERR(node_page);
- rn = page_address(node_page);
+ rn = F2FS_NODE(node_page);
ri = &(rn->i);
inode->i_mode = le16_to_cpu(ri->i_mode);
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
-
- if (!sbi->por_doing && inode->i_nlink == 0) {
- ret = -ENOENT;
- goto bad_inode;
- }
-
make_now:
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
- __GFP_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -158,9 +151,9 @@ void update_inode(struct inode *inode, struct page *node_page)
struct f2fs_node *rn;
struct f2fs_inode *ri;
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, false);
- rn = page_address(node_page);
+ rn = F2FS_NODE(node_page);
ri = &(rn->i);
ri->i_mode = cpu_to_le16(inode->i_mode);
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
set_cold_node(inode, node_page);
set_page_dirty(node_page);
+ clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
+ if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ return 0;
+
if (wbc)
f2fs_balance_fs(sbi);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..4e475181280c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
int count = le32_to_cpu(sbi->raw_super->extension_count);
for (i = 0; i < count; i++) {
if (is_multimedia_file(name, extlist[i])) {
- set_cold_file(inode);
+ file_set_cold(inode);
break;
}
}
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
- if (!sbi->por_doing)
- d_instantiate(dentry, inode);
+ d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi);
inode->i_ctime = CURRENT_TIME;
- atomic_inc(&inode->i_count);
+ ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto out;
- /*
- * This file should be checkpointed during fsync.
- * We lost i_pino from now on.
- */
- set_cp_file(inode);
-
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- make_bad_inode(inode);
iput(inode);
return err;
}
@@ -247,7 +239,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (!de)
goto fail;
- err = check_orphan_space(sbi);
+ err = acquire_orphan_inode(sbi);
if (err) {
kunmap(page);
f2fs_put_page(page, 0);
@@ -401,7 +393,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page;
- struct page *old_page;
+ struct page *old_page, *new_page;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
@@ -423,7 +415,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
ilock = mutex_lock_op(sbi);
if (new_inode) {
- struct page *new_page;
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
@@ -435,14 +426,27 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry)
goto out_dir;
+ err = acquire_orphan_inode(sbi);
+ if (err)
+ goto put_out_dir;
+
+ if (update_dent_inode(old_inode, &new_dentry->d_name)) {
+ release_orphan_inode(sbi);
+ goto put_out_dir;
+ }
+
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = CURRENT_TIME;
if (old_dir_entry)
drop_nlink(new_inode);
drop_nlink(new_inode);
+
if (!new_inode->i_nlink)
add_orphan_inode(sbi, new_inode->i_ino);
+ else
+ release_orphan_inode(sbi);
+
update_inode_page(new_inode);
} else {
err = f2fs_add_link(new_dentry, old_inode);
@@ -475,6 +479,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
mutex_unlock_op(sbi, ilock);
return 0;
+put_out_dir:
+ f2fs_put_page(new_page, 1);
out_dir:
if (old_dir_entry) {
kunmap(old_dir_page);
@@ -498,6 +504,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
.rename = f2fs_rename,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +519,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
@@ -522,6 +530,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
};
const struct inode_operations f2fs_special_inode_operations = {
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..f5172e271d46 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
level = get_node_path(index, offset, noffset);
nids[0] = dn->inode->i_ino;
- npage[0] = get_node_page(sbi, nids[0]);
- if (IS_ERR(npage[0]))
- return PTR_ERR(npage[0]);
+ npage[0] = dn->inode_page;
+ if (!npage[0]) {
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+ }
parent = npage[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i]);
+ npage[i] = new_node_page(dn, noffset[i], NULL);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
err = PTR_ERR(npage[i]);
@@ -562,7 +565,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
return PTR_ERR(page);
}
- rn = (struct f2fs_node *)page_address(page);
+ rn = F2FS_NODE(page);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
child_nid = le32_to_cpu(rn->in.nid[i]);
@@ -695,7 +698,7 @@ restart:
set_new_dnode(&dn, inode, page, NULL, 0);
unlock_page(page);
- rn = page_address(page);
+ rn = F2FS_NODE(page);
switch (level) {
case 0:
case 1:
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
return 0;
}
-int new_inode_page(struct inode *inode, const struct qstr *name)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
{
- struct page *page;
struct dnode_of_data dn;
/* allocate inode page for new inode */
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- page = new_node_page(&dn, 0);
- init_dent_inode(name, page);
- if (IS_ERR(page))
- return PTR_ERR(page);
- f2fs_put_page(page, 1);
- return 0;
+
+ /* caller should f2fs_put_page(page, 1); */
+ return new_node_page(&dn, 0, NULL);
}
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+ unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
set_cold_node(dn->inode, page);
dn->node_page = page;
- sync_inode_page(dn);
+ if (ipage)
+ update_inode(dn->inode, ipage);
+ else
+ sync_inode_page(dn);
set_page_dirty(page);
if (ofs == 0)
inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1480,8 +1484,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
- src = (struct f2fs_node *)page_address(page);
- dst = (struct f2fs_node *)page_address(ipage);
+ src = F2FS_NODE(page);
+ dst = F2FS_NODE(ipage);
memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
dst->i.i_size = 0;
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
new_ni = old_ni;
new_ni.ino = ino;
+ if (!inc_valid_node_count(sbi, NULL, 1))
+ WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR);
inc_valid_inode_count(sbi);
-
f2fs_put_page(ipage, 1);
return 0;
}
@@ -1530,7 +1535,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
goto out;
lock_page(page);
- rn = (struct f2fs_node *)page_address(page);
+ rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
sum_entry->version = 0;
sum_entry->ofs_in_node = 0;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..87349c4ea0ef 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -155,8 +155,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
static inline void fill_node_footer(struct page *page, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
if (reset)
memset(rn, 0, sizeof(*rn));
rn->footer.nid = cpu_to_le32(nid);
@@ -166,10 +165,8 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
static inline void copy_node_footer(struct page *dst, struct page *src)
{
- void *src_addr = page_address(src);
- void *dst_addr = page_address(dst);
- struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
- struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
+ struct f2fs_node *src_rn = F2FS_NODE(src);
+ struct f2fs_node *dst_rn = F2FS_NODE(dst);
memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
}
@@ -177,45 +174,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
+
rn->footer.cp_ver = ckpt->checkpoint_ver;
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
static inline nid_t ino_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.ino);
}
static inline nid_t nid_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.nid);
}
static inline unsigned int ofs_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
unsigned flag = le32_to_cpu(rn->footer.flag);
return flag >> OFFSET_BIT_SHIFT;
}
static inline unsigned long long cpver_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le64_to_cpu(rn->footer.cp_ver);
}
static inline block_t next_blkaddr_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.next_blkaddr);
}
@@ -250,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page)
static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+ struct f2fs_node *rn = F2FS_NODE(p);
wait_on_page_writeback(p);
@@ -263,7 +255,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
static inline nid_t get_nid(struct page *p, int off, bool i)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+ struct f2fs_node *rn = F2FS_NODE(p);
+
if (i)
return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
return le32_to_cpu(rn->in.nid[off]);
@@ -275,25 +268,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+ return F2FS_I(inode)->i_advise & type;
}
-static inline void set_cold_file(struct inode *inode)
+static inline void set_file(struct inode *inode, int type)
{
- F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+ F2FS_I(inode)->i_advise |= type;
}
-static inline int is_cp_file(struct inode *inode)
+static inline void clear_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_CP_BIT;
+ F2FS_I(inode)->i_advise &= ~type;
}
-static inline void set_cp_file(struct inode *inode)
-{
- F2FS_I(inode)->i_advise |= FADVISE_CP_BIT;
-}
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
static inline int is_cold_data(struct page *page)
{
@@ -310,33 +305,19 @@ static inline void clear_cold_data(struct page *page)
ClearPageChecked(page);
}
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << COLD_BIT_SHIFT);
+ struct f2fs_node *rn = F2FS_NODE(page);
+ return le32_to_cpu(rn->footer.flag) & (1 << type);
}
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << FSYNC_BIT_SHIFT);
-}
-
-static inline unsigned char is_dent_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
static inline void set_cold_node(struct inode *inode, struct page *page)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
+ struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (S_ISDIR(inode->i_mode))
@@ -346,26 +327,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_fsync_mark(struct page *page, int mark)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (mark)
- flag |= (0x1 << FSYNC_BIT_SHIFT);
- else
- flag &= ~(0x1 << FSYNC_BIT_SHIFT);
- rn->footer.flag = cpu_to_le32(flag);
-}
-
-static inline void set_dentry_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
- flag |= (0x1 << DENT_BIT_SHIFT);
+ flag |= (0x1 << type);
else
- flag &= ~(0x1 << DENT_BIT_SHIFT);
+ flag &= ~(0x1 << type);
rn->footer.flag = cpu_to_le32(flag);
}
+#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..639eb3465286 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,45 +40,64 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ struct f2fs_node *raw_node = F2FS_NODE(ipage);
struct f2fs_inode *raw_inode = &(raw_node->i);
- struct qstr name;
+ nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
+ struct qstr name;
struct page *page;
- struct inode *dir;
+ struct inode *dir, *einode;
int err = 0;
- if (!is_dent_dnode(ipage))
- goto out;
-
- dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+ if (!dir) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+ set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ add_dirty_dir_inode(dir);
}
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
-
+retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de) {
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
kunmap(page);
f2fs_put_page(page, 0);
- } else {
- err = __f2fs_add_link(dir, &name, inode);
+ goto out;
}
- iput(dir);
+ if (de) {
+ einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ if (IS_ERR(einode)) {
+ WARN_ON(1);
+ if (PTR_ERR(einode) == -ENOENT)
+ err = -EEXIST;
+ goto out;
+ }
+ f2fs_delete_entry(de, page, einode);
+ iput(einode);
+ goto retry;
+ }
+ err = __f2fs_add_link(dir, &name, inode);
out:
- kunmap(ipage);
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+ "ino = %x, name = %s, dir = %lx, err = %d",
+ ino_of_node(ipage), raw_inode->i_name,
+ IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
static int recover_inode(struct inode *inode, struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
+ struct f2fs_node *raw_node = F2FS_NODE(node_page);
struct f2fs_inode *raw_inode = &(raw_node->i);
+ if (!IS_INODE(node_page))
+ return 0;
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_size_write(inode, le64_to_cpu(raw_inode->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +107,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- return recover_dentry(node_page, inode);
+ if (is_dent_dnode(node_page))
+ return recover_dentry(node_page, inode);
+
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+ ino_of_node(node_page), raw_inode->i_name);
+ return 0;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +143,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
if (entry) {
- entry->blkaddr = blkaddr;
if (IS_INODE(page) && is_dent_dnode(page))
set_inode_flag(F2FS_I(entry->inode),
FI_INC_LINK);
@@ -134,48 +157,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
- goto unlock_out;
+ break;
}
/* add this fsync inode to the list */
entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
if (!entry) {
err = -ENOMEM;
- goto unlock_out;
+ break;
}
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
- goto unlock_out;
+ break;
}
-
list_add_tail(&entry->list, head);
- entry->blkaddr = blkaddr;
- }
- if (IS_INODE(page)) {
- err = recover_inode(entry->inode, page);
- if (err == -ENOENT) {
- goto next;
- } else if (err) {
- err = -EINVAL;
- goto unlock_out;
- }
}
+ entry->blkaddr = blkaddr;
+
+ err = recover_inode(entry->inode, page);
+ if (err && err != -ENOENT)
+ break;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
return err;
}
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
- struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
@@ -186,15 +201,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
}
}
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr, struct dnode_of_data *dn)
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
(sbi->blocks_per_seg - 1);
struct f2fs_summary sum;
- nid_t ino;
+ nid_t ino, nid;
void *kaddr;
struct inode *inode;
struct page *node_page;
@@ -203,7 +218,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
- return;
+ return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +237,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 1);
}
+ /* Use the locked dnode page and inode */
+ nid = le32_to_cpu(sum.nid);
+ if (dn->inode->i_ino == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.nid = nid;
+ tdn.node_page = dn->inode_page;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ } else if (dn->nid == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ }
+
/* Get the node page */
- node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
bidx = start_bidx_of_node(ofs_of_node(node_page)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
- return;
+ return PTR_ERR(inode);
truncate_hole(inode, bidx, bidx + 1);
iput(inode);
+ return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +279,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- int err = 0;
+ int err = 0, recovered = 0;
int ilock;
start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +317,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* Check the previous node page having this index */
- check_index_in_prev_nodes(sbi, dest);
+ err = check_index_in_prev_nodes(sbi, dest, &dn);
+ if (err)
+ goto err;
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* write dummy data page */
recover_data_page(sbi, NULL, &sum, src, dest);
update_extent_cache(dest, &dn);
+ recovered++;
}
dn.ofs_in_node++;
}
@@ -305,9 +342,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
- return 0;
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+ "recovered_data = %d blocks, err = %d",
+ inode->i_ino, recovered, err);
+ return err;
}
static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +382,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
@@ -348,7 +390,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
err = do_recover_data(sbi, entry->inode, page, blkaddr);
if (err)
- goto out;
+ break;
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -359,7 +401,6 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
@@ -382,6 +423,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
+ sbi->por_doing = 1;
err = find_fsync_dnodes(sbi, &inode_list);
if (err)
goto out;
@@ -390,13 +432,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
goto out;
/* step #2: recover data */
- sbi->por_doing = 1;
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- sbi->por_doing = 0;
BUG_ON(!list_empty(&inode_list));
out:
- destroy_fsync_dnodes(sbi, &inode_list);
+ destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
- write_checkpoint(sbi, false);
+ sbi->por_doing = 0;
+ if (!err)
+ write_checkpoint(sbi, false);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..68e344f9e042 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
* Adding dirty entry into seglist is not critical operation.
* If a given segment is one of current working segments, it won't be added.
*/
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
__set_test_and_free(sbi, segno);
- offset = segno + 1;
}
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
- offset = segno + 1;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
dirty_i->nr_dirty[PRE]--;
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
* This function should be resided under the curseg_mutex lock
*/
static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
- struct f2fs_summary *sum, unsigned short offset)
+ struct f2fs_summary *sum)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
void *addr = curseg->sum_blk;
- addr += offset * sizeof(struct f2fs_summary);
+ addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
return;
}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
-{
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
- unsigned int segno;
- unsigned int ofs = 0;
-
- /*
- * If there is not enough reserved sections,
- * we should not reuse prefree segments.
- */
- if (has_not_enough_free_secs(sbi, 0))
- return NULL_SEGNO;
-
- /*
- * NODE page should not reuse prefree segment,
- * since those information is used for SPOR.
- */
- if (IS_NODESEG(type))
- return NULL_SEGNO;
-next:
- segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
- ofs += sbi->segs_per_sec;
-
- if (segno < TOTAL_SEGS(sbi)) {
- int i;
-
- /* skip intermediate segments in a section */
- if (segno % sbi->segs_per_sec)
- goto next;
-
- /* skip if the section is currently used */
- if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
- goto next;
-
- /* skip if whole section is not prefree */
- for (i = 1; i < sbi->segs_per_sec; i++)
- if (!test_bit(segno + i, prefree_segmap))
- goto next;
-
- /* skip if whole section was not free at the last checkpoint */
- for (i = 0; i < sbi->segs_per_sec; i++)
- if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
- goto next;
-
- return segno;
- }
- return NULL_SEGNO;
-}
-
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int segno = curseg->segno;
+ unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec)
- return !test_bit(segno + 1, free_i->free_segmap);
+ if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
int dir = ALLOC_LEFT;
write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ GET_SUM_BLOCK(sbi, segno));
if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
goto out;
}
- curseg->next_segno = check_prefree_segments(sbi, type);
-
- if (curseg->next_segno != NULL_SEGNO)
- change_curseg(sbi, type, false);
- else if (type == CURSEG_WARM_NODE)
+ if (type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
else
new_curseg(sbi, type, false);
out:
+#ifdef CONFIG_F2FS_STAT_FS
sbi->segment_count[curseg->alloc_type]++;
+#endif
+ return;
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -664,18 +611,12 @@ static void f2fs_end_io_write(struct bio *bio, int err)
struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
{
struct bio *bio;
- struct bio_private *priv;
-retry:
- priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
- if (!priv) {
- cond_resched();
- goto retry;
- }
/* No failure on bio allocation */
bio = bio_alloc(GFP_NOIO, npages);
bio->bi_bdev = bdev;
- bio->bi_private = priv;
+ bio->bi_private = NULL;
+
return bio;
}
@@ -734,8 +675,17 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
do_submit_bio(sbi, type, false);
alloc_new:
if (sbi->bio[type] == NULL) {
+ struct bio_private *priv;
+retry:
+ priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
+ if (!priv) {
+ cond_resched();
+ goto retry;
+ }
+
sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ sbi->bio[type]->bi_private = priv;
/*
* The end_io will be assigned at the sumbission phase.
* Until then, let bio_add_page() merge consecutive IOs as much
@@ -755,6 +705,16 @@ alloc_new:
trace_f2fs_submit_write_page(page, blk_addr, type);
}
+void f2fs_wait_on_page_writeback(struct page *page,
+ enum page_type type, bool sync)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ if (PageWriteback(page)) {
+ f2fs_submit_bio(sbi, type, sync);
+ wait_on_page_writeback(page);
+ }
+}
+
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -795,7 +755,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || is_cold_file(inode))
+ else if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
else
return CURSEG_WARM_DATA;
@@ -844,11 +804,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
* because, this function updates a summary entry in the
* current summary block.
*/
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->block_count[curseg->alloc_type]++;
+#endif
/*
* SIT information should be updated before segment allocation,
@@ -943,7 +905,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
@@ -980,7 +942,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
}
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
if (next_segno != segno) {
@@ -1579,13 +1541,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0;
+ unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
unsigned short valid_blocks;
- while (segno < TOTAL_SEGS(sbi)) {
+ while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
- if (segno >= TOTAL_SEGS(sbi))
+ segno = find_next_inuse(free_i, total_segs, offset);
+ if (segno >= total_segs)
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..66d1ec137e44 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -18,23 +18,28 @@
#include <linux/parser.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/exportfs.h>
#include <linux/blkdev.h>
#include <linux/f2fs_fs.h>
+#include <linux/sysfs.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "xattr.h"
+#include "gc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
+static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
+static struct kset *f2fs_kset;
enum {
- Opt_gc_background_off,
+ Opt_gc_background,
Opt_disable_roll_forward,
Opt_discard,
Opt_noheap,
@@ -46,7 +51,7 @@ enum {
};
static match_table_t f2fs_tokens = {
- {Opt_gc_background_off, "background_gc_off"},
+ {Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
@@ -57,6 +62,113 @@ static match_table_t f2fs_tokens = {
{Opt_err, NULL},
};
+/* Sysfs support for f2fs */
+struct f2fs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
+ ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
+ const char *, size_t);
+ int offset;
+};
+
+static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned int *ui;
+
+ if (!gc_kth)
+ return -EINVAL;
+
+ ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned long t;
+ unsigned int *ui;
+ ssize_t ret;
+
+ if (!gc_kth)
+ return -EINVAL;
+
+ ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret < 0)
+ return ret;
+ *ui = t;
+ return count;
+}
+
+static ssize_t f2fs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_sb_release(struct kobject *kobj)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .offset = offsetof(struct f2fs_gc_kthread, _elname), \
+}
+
+#define F2FS_RW_ATTR(name, elname) \
+ F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname)
+
+F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(gc_idle, gc_idle);
+
+#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
+static struct attribute *f2fs_attrs[] = {
+ ATTR_LIST(gc_min_sleep_time),
+ ATTR_LIST(gc_max_sleep_time),
+ ATTR_LIST(gc_no_gc_sleep_time),
+ ATTR_LIST(gc_idle),
+ NULL,
+};
+
+static const struct sysfs_ops f2fs_attr_ops = {
+ .show = f2fs_attr_show,
+ .store = f2fs_attr_store,
+};
+
+static struct kobj_type f2fs_ktype = {
+ .default_attrs = f2fs_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+ .release = f2fs_sb_release,
+};
+
void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
@@ -76,6 +188,91 @@ static void init_once(void *foo)
inode_init_once(&fi->vfs_inode);
}
+static int parse_options(struct super_block *sb, char *options)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *name;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strncmp(name, "on", 2))
+ set_opt(sbi, BG_GC);
+ else if (!strncmp(name, "off", 3))
+ clear_opt(sbi, BG_GC);
+ else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
@@ -112,6 +309,17 @@ static int f2fs_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+ set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ return;
+}
+
static void f2fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -127,6 +335,12 @@ static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ if (sbi->s_proc) {
+ remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry(sb->s_id, f2fs_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
@@ -140,6 +354,8 @@ static void f2fs_put_super(struct super_block *sb)
destroy_segment_manager(sbi);
kfree(sbi->ckpt);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
brelse(sbi->raw_super_buf);
@@ -170,7 +386,7 @@ static int f2fs_freeze(struct super_block *sb)
{
int err;
- if (sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(sb))
return 0;
err = f2fs_sync_fs(sb, 1);
@@ -214,10 +430,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (test_opt(sbi, BG_GC))
- seq_puts(seq, ",background_gc_on");
+ if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+ seq_printf(seq, ",background_gc=%s", "on");
else
- seq_puts(seq, ",background_gc_off");
+ seq_printf(seq, ",background_gc=%s", "off");
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -244,11 +460,94 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static int segment_info_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i;
+
+ for (i = 0; i < total_segs; i++) {
+ seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1));
+ if (i != 0 && (i % 10) == 0)
+ seq_puts(seq, "\n");
+ else
+ seq_puts(seq, " ");
+ }
+ return 0;
+}
+
+static int segment_info_open_fs(struct inode *inode, struct file *file)
+{
+ return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations f2fs_seq_segment_info_fops = {
+ .owner = THIS_MODULE,
+ .open = segment_info_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_mount_info org_mount_opt;
+ int err, active_logs;
+
+ /*
+ * Save the old mount options in case we
+ * need to restore them.
+ */
+ org_mount_opt = sbi->mount_opt;
+ active_logs = sbi->active_logs;
+
+ /* parse mount options */
+ err = parse_options(sb, data);
+ if (err)
+ goto restore_opts;
+
+ /*
+ * Previous and new state of filesystem is RO,
+ * so no point in checking GC conditions.
+ */
+ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+ goto skip;
+
+ /*
+ * We stop the GC thread if FS is mounted as RO
+ * or if background_gc = off is passed in mount
+ * option. Also sync the filesystem.
+ */
+ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if (sbi->gc_thread) {
+ stop_gc_thread(sbi);
+ f2fs_sync_fs(sb, 1);
+ }
+ } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ err = start_gc_thread(sbi);
+ if (err)
+ goto restore_opts;
+ }
+skip:
+ /* Update the POSIXACL Flag */
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ return 0;
+
+restore_opts:
+ sbi->mount_opt = org_mount_opt;
+ sbi->active_logs = active_logs;
+ return err;
+}
+
static struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
.drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
.write_inode = f2fs_write_inode,
+ .dirty_inode = f2fs_dirty_inode,
.show_options = f2fs_show_options,
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
@@ -256,6 +555,7 @@ static struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
+ .remount_fs = f2fs_remount,
};
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +603,6 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
- char *options)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int arg = 0;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
-
- switch (token) {
- case Opt_gc_background_off:
- clear_opt(sbi, BG_GC);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_discard:
- set_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
-#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
-#else
- case Opt_nouser_xattr:
- f2fs_msg(sb, KERN_INFO,
- "nouser_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
-#else
- case Opt_noacl:
- f2fs_msg(sb, KERN_INFO, "noacl options not supported");
- break;
-#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
- return -EINVAL;
- sbi->active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- default:
- f2fs_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- }
- return 0;
-}
-
static loff_t max_file_size(unsigned bits)
{
loff_t result = ADDRS_PER_INODE;
@@ -541,6 +768,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto free_sb_buf;
}
+ sb->s_fs_info = sbi;
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -553,7 +781,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- err = parse_options(sb, sbi, (char *)data);
+ err = parse_options(sb, (char *)data);
if (err)
goto free_sb_buf;
@@ -565,7 +793,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
- sb->s_fs_info = sbi;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,15 +901,28 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
"Cannot recover all fsync data errno=%ld", err);
}
- /* After POR, we can run background GC thread */
- err = start_gc_thread(sbi);
- if (err)
- goto fail;
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto fail;
+ }
err = f2fs_build_stats(sbi);
if (err)
goto fail;
+ if (f2fs_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
+
+ if (sbi->s_proc)
+ proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_info_fops, sb);
+
if (test_opt(sbi, DISCARD)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q))
@@ -691,6 +931,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
"the device does not support discard");
}
+ sbi->s_kobj.kset = f2fs_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto fail;
+
return 0;
fail:
stop_gc_thread(sbi);
@@ -765,22 +1012,28 @@ static int __init init_f2fs_fs(void)
err = create_checkpoint_caches();
if (err)
goto fail;
+ f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
+ if (!f2fs_kset)
+ goto fail;
err = register_filesystem(&f2fs_fs_type);
if (err)
goto fail;
f2fs_create_root_stats();
+ f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
fail:
return err;
}
static void __exit exit_f2fs_fs(void)
{
+ remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
destroy_checkpoint_caches();
destroy_gc_caches();
destroy_node_manager_caches();
destroy_inodecache();
+ kset_unregister(f2fs_kset);
}
module_init(init_f2fs_fs)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
*/
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
+#include <linux/security.h>
#include "f2fs.h"
#include "xattr.h"
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ break;
default:
return -EINVAL;
}
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
memcpy(list, prefix, prefix_len);
- memcpy(list+prefix_len, name, name_len);
+ memcpy(list + prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name,
- buffer, size);
+ return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+ return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
}
static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return 0;
}
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *page)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, (struct page *)page);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &f2fs_initxattrs, ipage);
+}
+#endif
+
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
.set = f2fs_xattr_advise_set,
};
+const struct xattr_handler f2fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = F2FS_XATTR_INDEX_SECURITY,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
#endif
[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
&f2fs_xattr_acl_default_handler,
#endif
&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ &f2fs_xattr_security_handler,
+#endif
&f2fs_xattr_advise_handler,
NULL,
};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
return -ENODATA;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return 0;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
}
int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len)
+ const void *value, size_t value_len, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
mark_inode_dirty(inode);
- page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
if (IS_ERR(page)) {
alloc_nid_failed(sbi, fi->i_xattr_nid);
fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
- update_inode_page(inode);
+ if (ipage)
+ update_inode(inode, ipage);
+ else
+ update_inode_page(inode);
mutex_unlock_op(sbi, ilock);
return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_acl_access_handler;
extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler *f2fs_xattr_handlers[];
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+ const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len)
+ const char *name, const void *value, size_t value_len)
{
return -EOPNOTSUPP;
}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
}
#endif
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+ const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return 0;
+}
+#endif
#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
const void *dir_ops; /* Opaque; default directory operations */
int dir_per_block; /* dir entries per block */
int dir_per_block_bits; /* log2(dir_per_block) */
+ unsigned int vol_id; /*volume ID*/
int fatent_shift;
struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
return err;
}
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ return put_user(sbi->vol_id, user_attr);
+}
+
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return fat_ioctl_get_attributes(inode, user_attr);
case FAT_IOCTL_SET_ATTRIBUTES:
return fat_ioctl_set_attributes(filp, user_attr);
+ case FAT_IOCTL_GET_VOLUME_ID:
+ return fat_ioctl_get_volume_id(inode, user_attr);
default:
return -ENOTTY; /* Inappropriate ioctl for device */
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
brelse(fsinfo_bh);
}
+ /* interpret volume ID as a little endian 32 bit integer */
+ if (sbi->fat_bits == 32)
+ sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+ ((u32)b->fat32.vol_id[1] << 8) |
+ ((u32)b->fat32.vol_id[2] << 16) |
+ ((u32)b->fat32.vol_id[3] << 24));
+ else /* fat 16 or 12 */
+ sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+ ((u32)b->fat16.vol_id[1] << 8) |
+ ((u32)b->fat16.vol_id[2] << 16) |
+ ((u32)b->fat16.vol_id[3] << 24));
+
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+ fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
va_end(args);
}
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
sb->s_flags |= MS_RDONLY;
- printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
- "set read-only\n", sb->s_id);
+ fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
}
}
EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6599222536eb..65343c3741ff 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -730,14 +730,14 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH
+ __FMODE_EXEC | O_PATH | __O_TMPFILE
));
fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/file_table.c b/fs/file_table.c
index 08e719b884ca..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
mntput(mnt);
}
-static DEFINE_SPINLOCK(delayed_fput_lock);
-static LIST_HEAD(delayed_fput_list);
+static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
- LIST_HEAD(head);
- spin_lock_irq(&delayed_fput_lock);
- list_splice_init(&delayed_fput_list, &head);
- spin_unlock_irq(&delayed_fput_lock);
- while (!list_empty(&head)) {
- struct file *f = list_first_entry(&head, struct file, f_u.fu_list);
- list_del_init(&f->f_u.fu_list);
- __fput(f);
+ struct llist_node *node = llist_del_all(&delayed_fput_list);
+ struct llist_node *next;
+
+ for (; node; node = next) {
+ next = llist_next(node);
+ __fput(llist_entry(node, struct file, f_u.fu_llist));
}
}
@@ -306,18 +303,22 @@ void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- unsigned long flags;
file_sb_list_del(file);
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
return;
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. free_ipc_ns()->
+ * shm_destroy() can do this. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
- spin_lock_irqsave(&delayed_fput_lock, flags);
- list_add(&file->f_u.fu_list, &delayed_fput_list);
- schedule_work(&delayed_fput_work);
- spin_unlock_irqrestore(&delayed_fput_lock, flags);
+
+ if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+ schedule_work(&delayed_fput_work);
}
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
- * I/O completion.
+ * I/O completion. We don't do it for sync(2) writeback because it has a
+ * separate, external IO completion path and ->sync_fs for guaranteeing
+ * inode metadata is written back correctly.
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
+ .for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -959,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
/*
* Retrieve work items and do the writeback they describe
*/
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
@@ -967,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
- /*
- * Override sync mode, in case we must wait for completion
- * because this thread is exiting now.
- */
- if (force_wait)
- work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
@@ -1021,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
* rescuer as work_list needs to be drained.
*/
do {
- pages_written = wb_do_writeback(wb, 0);
+ pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&bdi->work_list));
} else {
@@ -1362,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb)
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
+ .for_sync = 1,
};
/* Nothing to do? */
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
struct fscache_object, cookie_link);
cache = object->cache;
- if (object->state >= FSCACHE_OBJECT_DYING ||
+ if (fscache_object_is_dying(object) ||
test_bit(FSCACHE_IOERROR, &cache->flags))
cache = NULL;
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
BUG_ON(!ifsdef);
cache->flags = 0;
- ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+ ifsdef->event_mask =
+ ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
if (!tagname)
tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
{
struct fscache_object *object;
- spin_lock(&cache->object_list_lock);
-
while (!list_empty(&cache->object_list)) {
- object = list_entry(cache->object_list.next,
- struct fscache_object, cache_link);
- list_move_tail(&object->cache_link, dying_objects);
+ spin_lock(&cache->object_list_lock);
- _debug("withdraw %p", object->cookie);
+ if (!list_empty(&cache->object_list)) {
+ object = list_entry(cache->object_list.next,
+ struct fscache_object, cache_link);
+ list_move_tail(&object->cache_link, dying_objects);
- spin_lock(&object->lock);
- spin_unlock(&cache->object_list_lock);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
- spin_unlock(&object->lock);
+ _debug("withdraw %p", object->cookie);
+
+ /* This must be done under object_list_lock to prevent
+ * a race with fscache_drop_object().
+ */
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ }
+ spin_unlock(&cache->object_list_lock);
cond_resched();
- spin_lock(&cache->object_list_lock);
}
-
- spin_unlock(&cache->object_list_lock);
}
/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
atomic_set(&cookie->usage, 1);
atomic_set(&cookie->n_children, 0);
+ /* We keep the active count elevated until relinquishment to prevent an
+ * attempt to wake up every time the object operations queue quiesces.
+ */
+ atomic_set(&cookie->n_active, 1);
+
atomic_inc(&parent->usage);
atomic_inc(&parent->n_children);
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
cookie->flags =
(1 << FSCACHE_COOKIE_LOOKING_UP) |
- (1 << FSCACHE_COOKIE_CREATING) |
(1 << FSCACHE_COOKIE_NO_DATA_YET);
/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
- fscache_enqueue_object(object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
spin_unlock(&cookie->lock);
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
object_already_extant:
ret = -ENOBUFS;
- if (object->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dead(object)) {
spin_unlock(&cookie->lock);
goto error;
}
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
ret = -EEXIST;
hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_dying(p))
ret = -ENOBUFS;
goto cant_attach_object;
}
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
hlist_for_each_entry(p, &cookie->parent->backing_objects,
cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dying(p)) {
ret = -ENOBUFS;
spin_unlock(&cookie->parent->lock);
goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
- if (object->state < FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_live(object))
fscache_raise_event(
object, FSCACHE_OBJECT_EV_INVALIDATE);
}
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
*/
void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
{
- struct fscache_cache *cache;
struct fscache_object *object;
- unsigned long event;
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
return;
}
- _enter("%p{%s,%p},%d",
- cookie, cookie->def->name, cookie->netfs_data, retire);
+ _enter("%p{%s,%p,%d},%d",
+ cookie, cookie->def->name, cookie->netfs_data,
+ atomic_read(&cookie->n_active), retire);
+
+ ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
BUG();
}
- /* wait for the cookie to finish being instantiated (or to fail) */
- if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
- fscache_stat(&fscache_n_relinquishes_waitcrt);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- }
-
- event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+ /* No further netfs-accessing operations on this cookie permitted */
+ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (retire)
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
-try_again:
spin_lock(&cookie->lock);
-
- /* break links with all the active objects */
- while (!hlist_empty(&cookie->backing_objects)) {
- int n_reads;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object,
- cookie_link);
-
- _debug("RELEASE OBJ%x", object->debug_id);
-
- set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
- n_reads = atomic_read(&object->n_reads);
- if (n_reads) {
- int n_ops = object->n_ops;
- int n_in_progress = object->n_in_progress;
- spin_unlock(&cookie->lock);
- printk(KERN_ERR "FS-Cache:"
- " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
- cookie->def->name,
- n_reads, n_ops, n_in_progress);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- printk("Wait finished\n");
- goto try_again;
- }
-
- /* detach each cache object from the object cookie */
- spin_lock(&object->lock);
- hlist_del_init(&object->cookie_link);
-
- cache = object->cache;
- object->cookie = NULL;
- fscache_raise_event(object, event);
- spin_unlock(&object->lock);
-
- if (atomic_dec_and_test(&cookie->usage))
- /* the cookie refcount shouldn't be reduced to 0 yet */
- BUG();
+ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
}
+ spin_unlock(&cookie->lock);
- /* detach pointers back to the netfs */
+ /* Wait for cessation of activity requiring access to the netfs (when
+ * n_active reaches 0).
+ */
+ if (!atomic_dec_and_test(&cookie->n_active))
+ wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+
+ /* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
-
- spin_unlock(&cookie->lock);
+ BUG_ON(cookie->stores.rnode);
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
atomic_dec(&cookie->parent->n_children);
}
- /* finally dispose of the cookie */
+ /* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
fscache_cookie_put(cookie);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
struct fscache_cookie fscache_fsdef_index = {
.usage = ATOMIC_INIT(1),
+ .n_active = ATOMIC_INIT(1),
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
extern int fscache_wait_bit(void *);
extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
/*
* object.c
*/
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
- struct fscache_object *);
extern void fscache_enqueue_object(struct fscache_object *);
/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
extern const struct file_operations fscache_objlist_fops;
extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
#else
#define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
#endif
/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+ printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+ object->debug_id, object->event_mask, (1 << event));
+#endif
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
schedule();
return 0;
}
-EXPORT_SYMBOL(fscache_wait_bit);
/*
* wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
schedule();
return signal_pending(current);
}
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
/* initialise the primary index cookie */
atomic_set(&netfs->primary_index->usage, 1);
atomic_set(&netfs->primary_index->n_children, 0);
+ atomic_set(&netfs->primary_index->n_active, 1);
netfs->primary_index->def = &fscache_fsdef_netfs_def;
netfs->primary_index->parent = &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
*/
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
{
write_lock(&fscache_object_list_lock);
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-EXPORT_SYMBOL(fscache_object_destroy);
/*
* find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
{
struct fscache_objlist_data *data = m->private;
struct fscache_object *obj = v;
+ struct fscache_cookie *cookie;
unsigned long config = data->config;
- uint16_t keylen, auxlen;
char _type[3], *type;
- bool no_cookie;
u8 *buf = data->buf, *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
- " EM EV F S"
+ " EM EV FL S"
" | NETFS_COOKIE_DEF TY FL NETFS_DATA");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if ((unsigned long) v == 2) {
seq_puts(m, "======== ======== ==== ===== === === === == ====="
- " == == = ="
+ " == == == ="
" | ================ == == ================");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
} \
} while(0)
+ cookie = obj->cookie;
if (~config) {
- FILTER(obj->cookie,
+ FILTER(cookie->def,
COOKIE, NOCOOKIE);
- FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+ FILTER(fscache_object_is_active(obj) ||
obj->n_ops != 0 ||
obj->n_obj_ops != 0 ||
obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
}
seq_printf(m,
- "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+ "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
obj->debug_id,
obj->parent ? obj->parent->debug_id : -1,
- fscache_object_states_short[obj->state],
+ obj->state->short_name,
obj->n_children,
obj->n_ops,
obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->flags,
work_busy(&obj->work));
- no_cookie = true;
- keylen = auxlen = 0;
- if (obj->cookie) {
- spin_lock(&obj->lock);
- if (obj->cookie) {
- switch (obj->cookie->def->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- sprintf(_type, "%02u",
- obj->cookie->def->type);
- type = _type;
- break;
- }
+ if (fscache_use_cookie(obj)) {
+ uint16_t keylen = 0, auxlen = 0;
- seq_printf(m, "%-16s %s %2lx %16p",
- obj->cookie->def->name,
- type,
- obj->cookie->flags,
- obj->cookie->netfs_data);
-
- if (obj->cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = obj->cookie->def->get_key(
- obj->cookie->netfs_data,
- buf, 400);
-
- if (obj->cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = obj->cookie->def->get_aux(
- obj->cookie->netfs_data,
- buf + keylen, 512 - keylen);
-
- no_cookie = false;
+ switch (cookie->def->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ sprintf(_type, "%02u", cookie->def->type);
+ type = _type;
+ break;
}
- spin_unlock(&obj->lock);
- if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+ seq_printf(m, "%-16s %s %2lx %16p",
+ cookie->def->name,
+ type,
+ cookie->flags,
+ cookie->netfs_data);
+
+ if (cookie->def->get_key &&
+ config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->def->get_key(cookie->netfs_data,
+ buf, 400);
+
+ if (cookie->def->get_aux &&
+ config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->def->get_aux(cookie->netfs_data,
+ buf + keylen, 512 - keylen);
+ fscache_unuse_cookie(obj);
+
+ if (keylen > 0 || auxlen > 0) {
seq_printf(m, " ");
for (p = buf; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
seq_printf(m, "%02x", *p++);
}
}
- }
- if (no_cookie)
- seq_printf(m, "<no_cookie>\n");
- else
seq_printf(m, "\n");
+ } else {
+ seq_printf(m, "<no_netfs>\n");
+ }
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/prefetch.h>
#include "internal.h"
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
- [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
- [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
- [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
- [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
- [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
- [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
- [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
- [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
- [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
- [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
- [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
- [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
- [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state. Work states are execution states. No event processing
+ * is performed by them. The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition. Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = f \
+ }
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state. Wait states are event processing states. No execution
+ * is performed by them. Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y". The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = NULL, \
+ .transitions = { __VA_ARGS__, { 0, NULL } } \
+ }
+
+#define TRANSIT_TO(state, emask) \
+ { .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT, "?INI",
+ TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
+ TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
+ TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
+ TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
+ TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
+ TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables. These are for handling unexpected
+ * events, such as an I/O error. If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (acurrently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+ TRANSIT_TO(ABORT_INIT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+ TRANSIT_TO(LOOKUP_FAILURE,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
- [FSCACHE_OBJECT_INIT] = "INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
- [FSCACHE_OBJECT_CREATING] = "CRTN",
- [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
- [FSCACHE_OBJECT_ACTIVE] = "ACTV",
- [FSCACHE_OBJECT_INVALIDATING] = "INVL",
- [FSCACHE_OBJECT_UPDATING] = "UPDT",
- [FSCACHE_OBJECT_DYING] = "DYNG",
- [FSCACHE_OBJECT_LC_DYING] = "LCDY",
- [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
- [FSCACHE_OBJECT_RELEASING] = "RELS",
- [FSCACHE_OBJECT_RECYCLING] = "RCYC",
- [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
- [FSCACHE_OBJECT_DEAD] = "DEAD",
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+ TRANSIT_TO(KILL_OBJECT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
static int fscache_get_object(struct fscache_object *);
static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
/*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
object->debug_id, parent->debug_id, parent->n_ops);
spin_lock_nested(&parent->lock, 1);
- parent->n_ops--;
parent->n_obj_ops--;
+ parent->n_ops--;
if (parent->n_ops == 0)
fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&parent->lock);
}
/*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
*/
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
{
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
- enum fscache_object_state new_state;
- struct fscache_cookie *cookie;
- int event;
+ const struct fscache_transition *t;
+ const struct fscache_state *state, *new_state;
+ unsigned long events, event_mask;
+ int event = -1;
ASSERT(object != NULL);
_enter("{OBJ%x,%s,%lx}",
- object->debug_id, fscache_object_states[object->state],
- object->events);
-
- switch (object->state) {
- /* wait for the parent object to become ready */
- case FSCACHE_OBJECT_INIT:
- object->event_mask =
- FSCACHE_OBJECT_EVENTS_MASK &
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- fscache_initialise_object(object);
- goto done;
-
- /* look up the object metadata on disk */
- case FSCACHE_OBJECT_LOOKING_UP:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* create the object metadata on disk */
- case FSCACHE_OBJECT_CREATING:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* handle an object becoming available; start pending
- * operations and queue dependent operations for processing */
- case FSCACHE_OBJECT_AVAILABLE:
- fscache_object_available(object);
- goto active_transit;
-
- /* normal running state */
- case FSCACHE_OBJECT_ACTIVE:
- goto active_transit;
-
- /* Invalidate an object on disk */
- case FSCACHE_OBJECT_INVALIDATING:
- clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
- fscache_stat(&fscache_n_invalidates_run);
- fscache_stat(&fscache_n_cop_invalidate_object);
- fscache_invalidate_object(object);
- fscache_stat_d(&fscache_n_cop_invalidate_object);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
- goto active_transit;
-
- /* update the object metadata on disk */
- case FSCACHE_OBJECT_UPDATING:
- clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
- goto active_transit;
-
- /* handle an object dying during lookup or creation */
- case FSCACHE_OBJECT_LC_DYING:
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DYING;
- cookie = object->cookie;
- if (cookie) {
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_LOOKING_UP);
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_CREATING);
+ object->debug_id, object->state->name, object->events);
+
+ event_mask = object->event_mask;
+restart:
+ object->event_mask = 0; /* Mask normal event handling */
+ state = object->state;
+restart_masked:
+ events = object->events;
+
+ /* Handle any out-of-band events (typically an error) */
+ if (events & object->oob_event_mask) {
+ _debug("{OBJ%x} oob %lx",
+ object->debug_id, events & object->oob_event_mask);
+ for (t = object->oob_table; t->events; t++) {
+ if (events & t->events) {
+ state = t->transit_to;
+ ASSERT(state->work != NULL);
+ event = fls(events & t->events) - 1;
+ __clear_bit(event, &object->oob_event_mask);
+ clear_bit(event, &object->events);
+ goto execute_work_state;
+ }
}
- spin_unlock(&object->lock);
+ }
- fscache_done_parent_op(object);
+ /* Wait states are just transition tables */
+ if (!state->work) {
+ if (events & event_mask) {
+ for (t = state->transitions; t->events; t++) {
+ if (events & t->events) {
+ new_state = t->transit_to;
+ event = fls(events & t->events) - 1;
+ clear_bit(event, &object->events);
+ _debug("{OBJ%x} ev %d: %s -> %s",
+ object->debug_id, event,
+ state->name, new_state->name);
+ object->state = state = new_state;
+ goto execute_work_state;
+ }
+ }
- /* wait for completion of all active operations on this object
- * and the death of all child objects of this object */
- case FSCACHE_OBJECT_DYING:
- dying:
- clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
- spin_lock(&object->lock);
- _debug("dying OBJ%x {%d,%d}",
- object->debug_id, object->n_ops, object->n_children);
- if (object->n_ops == 0 && object->n_children == 0) {
- object->event_mask &=
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- object->event_mask |=
- (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR);
- } else {
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- object->event_mask |=
- 1 << FSCACHE_OBJECT_EV_CLEARED;
+ /* The event mask didn't include all the tabled bits */
+ BUG();
}
- spin_unlock(&object->lock);
- fscache_enqueue_dependents(object);
- fscache_start_operations(object);
- goto terminal_transit;
-
- /* handle an abort during initialisation */
- case FSCACHE_OBJECT_ABORT_INIT:
- _debug("handle abort init %lx", object->events);
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
- spin_lock(&object->lock);
- fscache_dequeue_object(object);
-
- object->state = FSCACHE_OBJECT_DYING;
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &object->cookie->flags))
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_CREATING);
- spin_unlock(&object->lock);
- goto dying;
-
- /* handle the netfs releasing an object and possibly marking it
- * obsolete too */
- case FSCACHE_OBJECT_RELEASING:
- case FSCACHE_OBJECT_RECYCLING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_release_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* handle the parent cache of this object being withdrawn from
- * active service */
- case FSCACHE_OBJECT_WITHDRAWING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_withdraw_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* complain about the object being woken up once it is
- * deceased */
- case FSCACHE_OBJECT_DEAD:
- printk(KERN_ERR "FS-Cache:"
- " Unexpected event in dead state %lx\n",
- object->events & object->event_mask);
- BUG();
-
- default:
- printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
- object->state);
- BUG();
- }
-
- /* determine the transition from a lookup state */
-lookup_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_LC_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_REQUEUE:
- goto done;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ /* Randomly woke up */
+ goto unmask_events;
}
- /* determine the transition from an active state */
-active_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_UPDATE:
- new_state = FSCACHE_OBJECT_UPDATING;
- goto change_state;
- case -1:
- new_state = FSCACHE_OBJECT_ACTIVE;
- goto change_state; /* sleep until event */
- default:
- goto unsupported_event;
- }
+execute_work_state:
+ _debug("{OBJ%x} exec %s", object->debug_id, state->name);
- /* determine the transition from a terminal state */
-terminal_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RETIRE:
- new_state = FSCACHE_OBJECT_RECYCLING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RELEASE:
- new_state = FSCACHE_OBJECT_RELEASING;
- goto change_state;
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_CLEARED:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ new_state = state->work(object, event);
+ event = -1;
+ if (new_state == NO_TRANSIT) {
+ _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+ fscache_enqueue_object(object);
+ event_mask = object->oob_event_mask;
+ goto unmask_events;
}
-change_state:
- spin_lock(&object->lock);
- object->state = new_state;
- spin_unlock(&object->lock);
+ _debug("{OBJ%x} %s -> %s",
+ object->debug_id, state->name, new_state->name);
+ object->state = state = new_state;
-done:
- _leave(" [->%s]", fscache_object_states[object->state]);
- return;
+ if (state->work) {
+ if (unlikely(state->work == ((void *)2UL))) {
+ _leave(" [dead]");
+ return;
+ }
+ goto restart_masked;
+ }
-unsupported_event:
- printk(KERN_ERR "FS-Cache:"
- " Unsupported event %d [%lx/%lx] in state %s\n",
- event, object->events, object->event_mask,
- fscache_object_states[object->state]);
- BUG();
+ /* Transited to wait state */
+ event_mask = object->oob_event_mask;
+ for (t = state->transitions; t->events; t++)
+ event_mask |= t->events;
+
+unmask_events:
+ object->event_mask = event_mask;
+ smp_mb();
+ events = object->events;
+ if (events & event_mask)
+ goto restart;
+ _leave(" [msk %lx]", event_mask);
}
/*
* execute an object
*/
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
_enter("{OBJ%x}", object->debug_id);
start = jiffies;
- fscache_object_state_machine(object);
+ fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- if (object->events & object->event_mask)
- fscache_enqueue_object(object);
- clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
fscache_put_object(object);
}
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+ struct fscache_cookie *cookie,
+ struct fscache_cache *cache)
+{
+ const struct fscache_transition *t;
+
+ atomic_inc(&cache->object_count);
+
+ object->state = STATE(WAIT_FOR_INIT);
+ object->oob_table = fscache_osm_init_oob;
+ object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+ spin_lock_init(&object->lock);
+ INIT_LIST_HEAD(&object->cache_link);
+ INIT_HLIST_NODE(&object->cookie_link);
+ INIT_WORK(&object->work, fscache_object_work_func);
+ INIT_LIST_HEAD(&object->dependents);
+ INIT_LIST_HEAD(&object->dep_link);
+ INIT_LIST_HEAD(&object->pending_ops);
+ object->n_children = 0;
+ object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+ object->events = 0;
+ object->store_limit = 0;
+ object->store_limit_l = 0;
+ object->cache = cache;
+ object->cookie = cookie;
+ object->parent = NULL;
+
+ object->oob_event_mask = 0;
+ for (t = object->oob_table; t->events; t++)
+ object->oob_event_mask |= t->events;
+ object->event_mask = object->oob_event_mask;
+ for (t = object->state->transitions; t->events; t++)
+ object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_event_mask = 0;
+ fscache_dequeue_object(object);
+ return transit_to(KILL_OBJECT);
+}
/*
* initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
* immediately to do a creation
* - we may need to start the process of creating a parent and we need to wait
* for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+ int event)
{
struct fscache_object *parent;
+ bool success;
- _enter("");
- ASSERT(object->cookie != NULL);
- ASSERT(object->cookie->parent != NULL);
-
- if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
- _debug("abort init %lx", object->events);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_ABORT_INIT;
- spin_unlock(&object->lock);
- return;
- }
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->cookie->lock);
- spin_lock_nested(&object->cookie->parent->lock, 1);
+ ASSERT(list_empty(&object->dep_link));
parent = object->parent;
if (!parent) {
- _debug("no parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else {
- spin_lock(&object->lock);
- spin_lock_nested(&parent->lock, 1);
- _debug("parent %s", fscache_object_states[parent->state]);
-
- if (parent->state >= FSCACHE_OBJECT_DYING) {
- _debug("bad parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
- _debug("wait");
-
- /* we may get woken up in this state by child objects
- * binding on to us, so we need to make sure we don't
- * add ourself to the list multiple times */
- if (list_empty(&object->dep_link)) {
- fscache_stat(&fscache_n_cop_grab_object);
- object->cache->ops->grab_object(object);
- fscache_stat_d(&fscache_n_cop_grab_object);
- list_add(&object->dep_link,
- &parent->dependents);
-
- /* fscache_acquire_non_index_cookie() uses this
- * to wake the chain up */
- if (parent->state == FSCACHE_OBJECT_INIT)
- fscache_enqueue_object(parent);
- }
- } else {
- _debug("go");
- parent->n_ops++;
- parent->n_obj_ops++;
- object->lookup_jif = jiffies;
- object->state = FSCACHE_OBJECT_LOOKING_UP;
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- }
+ _leave(" [no parent]");
+ return transit_to(DROP_OBJECT);
+ }
- spin_unlock(&parent->lock);
- spin_unlock(&object->lock);
+ _debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+ if (fscache_object_is_dying(parent)) {
+ _leave(" [bad parent]");
+ return transit_to(DROP_OBJECT);
}
- spin_unlock(&object->cookie->parent->lock);
- spin_unlock(&object->cookie->lock);
+ if (fscache_object_is_available(parent)) {
+ _leave(" [ready]");
+ return transit_to(PARENT_READY);
+ }
+
+ _debug("wait");
+
+ spin_lock(&parent->lock);
+ fscache_stat(&fscache_n_cop_grab_object);
+ success = false;
+ if (fscache_object_is_live(parent) &&
+ object->cache->ops->grab_object(object)) {
+ list_add(&object->dep_link, &parent->dependents);
+ success = true;
+ }
+ fscache_stat_d(&fscache_n_cop_grab_object);
+ spin_unlock(&parent->lock);
+ if (!success) {
+ _leave(" [grab failed]");
+ return transit_to(DROP_OBJECT);
+ }
+
+ /* fscache_acquire_non_index_cookie() uses this
+ * to wake the chain up */
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+ _leave(" [wait]");
+ return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+ int event)
+{
+ struct fscache_object *parent = object->parent;
+
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ ASSERT(parent != NULL);
+
+ spin_lock(&parent->lock);
+ parent->n_ops++;
+ parent->n_obj_ops++;
+ object->lookup_jif = jiffies;
+ spin_unlock(&parent->lock);
+
_leave("");
+ return transit_to(LOOK_UP_OBJECT);
}
/*
* look an object up in the cache from which it was allocated
* - we hold an "access lock" on the parent object, so the parent object cannot
* be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+ int event)
{
struct fscache_cookie *cookie = object->cookie;
- struct fscache_object *parent;
+ struct fscache_object *parent = object->parent;
int ret;
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_table = fscache_osm_lookup_oob;
- parent = object->parent;
ASSERT(parent != NULL);
ASSERTCMP(parent->n_ops, >, 0);
ASSERTCMP(parent->n_obj_ops, >, 0);
/* make sure the parent is still available */
- ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
-
- if (parent->state >= FSCACHE_OBJECT_DYING ||
- test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
- _debug("unavailable");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- _leave("");
- return;
+ ASSERT(fscache_object_is_available(parent));
+
+ if (fscache_object_is_dying(parent) ||
+ test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+ !fscache_use_cookie(object)) {
+ _leave(" [unavailable]");
+ return transit_to(LOOKUP_FAILURE);
}
- _debug("LOOKUP \"%s/%s\" in \"%s\"",
- parent->cookie->def->name, cookie->def->name,
- object->cache->tag->name);
+ _debug("LOOKUP \"%s\" in \"%s\"",
+ cookie->def->name, object->cache->tag->name);
fscache_stat(&fscache_n_object_lookups);
fscache_stat(&fscache_n_cop_lookup_object);
ret = object->cache->ops->lookup_object(object);
fscache_stat_d(&fscache_n_cop_lookup_object);
- if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ fscache_unuse_cookie(object);
if (ret == -ETIMEDOUT) {
/* probably stuck behind another object, so move this one to
* the back of the queue */
fscache_stat(&fscache_n_object_lookups_timed_out);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ _leave(" [timeout]");
+ return NO_TRANSIT;
}
- _leave("");
+ if (ret < 0) {
+ _leave(" [error]");
+ return transit_to(LOOKUP_FAILURE);
+ }
+
+ _leave(" [ok]");
+ return transit_to(OBJECT_AVAILABLE);
}
/**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_negative);
- /* transit here to allow write requests to begin stacking up
- * and read requests to begin returning ENODATA */
- object->state = FSCACHE_OBJECT_CREATING;
- spin_unlock(&object->lock);
-
- set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+ /* Allow write requests to begin stacking up and read requests to begin
+ * returning ENODATA.
+ */
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
_debug("wake up lookup %p", &cookie->flags);
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- } else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
- spin_unlock(&object->lock);
}
-
_leave("");
}
EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
/* if we were still looking up, then we must have a positive lookup
* result, in which case there may be data available */
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_positive);
- clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ /* We do (presumably) have data */
+ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
-
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ /* Allow write requests to begin stacking up and read requests
+ * to begin shovelling data.
+ */
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
} else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
fscache_stat(&fscache_n_object_created);
-
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- smp_wmb();
}
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+ set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
_leave("");
}
EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
/*
* handle an object that has just become available
*/
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+ int event)
{
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->lock);
+ object->oob_table = fscache_osm_run_oob;
- if (object->cookie &&
- test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
- wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+ spin_lock(&object->lock);
fscache_done_parent_op(object);
if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
- fscache_enqueue_dependents(object);
fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
fscache_stat(&fscache_n_object_avail);
_leave("");
+ return transit_to(JUMPSTART_DEPS);
}
/*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
*/
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+ int event)
{
- struct fscache_object *parent = object->parent;
- struct fscache_cache *cache = object->cache;
+ _enter("{OBJ%x},%d", object->debug_id, event);
- _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+ return NO_TRANSIT; /* Not finished; requeue */
+ return transit_to(WAIT_FOR_CMD);
+}
- ASSERTCMP(object->cookie, ==, NULL);
- ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failute.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+ int event)
+{
+ struct fscache_cookie *cookie;
- spin_lock(&cache->object_list_lock);
- list_del_init(&object->cache_link);
- spin_unlock(&cache->object_list_lock);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_cop_drop_object);
- cache->ops->drop_object(object);
- fscache_stat_d(&fscache_n_cop_drop_object);
+ object->oob_event_mask = 0;
- if (parent) {
- _debug("release parent OBJ%x {%d}",
- parent->debug_id, parent->n_children);
+ fscache_stat(&fscache_n_cop_lookup_complete);
+ object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
- spin_lock(&parent->lock);
- parent->n_children--;
- if (parent->n_children == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
- object->parent = NULL;
+ cookie = object->cookie;
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+ fscache_done_parent_op(object);
+ return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x,%d,%d},%d",
+ object->debug_id, object->n_ops, object->n_children, event);
+
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ object->oob_event_mask = 0;
+
+ if (list_empty(&object->dependents) &&
+ object->n_ops == 0 &&
+ object->n_children == 0)
+ return transit_to(DROP_OBJECT);
+
+ if (object->n_in_progress == 0) {
+ spin_lock(&object->lock);
+ if (object->n_ops > 0 && object->n_in_progress == 0)
+ fscache_start_operations(object);
+ spin_unlock(&object->lock);
}
- /* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ if (!list_empty(&object->dependents))
+ return transit_to(KILL_DEPENDENTS);
- _leave("");
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
*/
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+ int event)
{
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_drop_object(object);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+ return NO_TRANSIT; /* Not finished */
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * withdraw an object from active service
+ * Drop an object's attachments
*/
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+ int event)
{
- struct fscache_cookie *cookie;
- bool detached;
+ struct fscache_object *parent = object->parent;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_cache *cache = object->cache;
+ bool awaken = false;
- _enter("");
+ _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
- spin_lock(&object->lock);
- cookie = object->cookie;
- if (cookie) {
- /* need to get the cookie lock before the object lock, starting
- * from the object pointer */
- atomic_inc(&cookie->usage);
- spin_unlock(&object->lock);
+ ASSERT(cookie != NULL);
+ ASSERT(!hlist_unhashed(&object->cookie_link));
- detached = false;
- spin_lock(&cookie->lock);
- spin_lock(&object->lock);
+ /* Make sure the cookie no longer points here and that the netfs isn't
+ * waiting for us.
+ */
+ spin_lock(&cookie->lock);
+ hlist_del_init(&object->cookie_link);
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ awaken = true;
+ spin_unlock(&cookie->lock);
- if (object->cookie == cookie) {
- hlist_del_init(&object->cookie_link);
- object->cookie = NULL;
- fscache_invalidation_complete(cookie);
- detached = true;
- }
- spin_unlock(&cookie->lock);
- fscache_cookie_put(cookie);
- if (detached)
- fscache_cookie_put(cookie);
- }
+ if (awaken)
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ /* Prevent a race with our last child, which has to signal EV_CLEARED
+ * before dropping our spinlock.
+ */
+ spin_lock(&object->lock);
spin_unlock(&object->lock);
- fscache_drop_object(object);
-}
+ /* Discard from the cache's collection of objects */
+ spin_lock(&cache->object_list_lock);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- * (1) recycler decides to reclaim an in-use object
- * (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- * simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
- struct fscache_object *object)
-{
- bool enqueue = false;
+ fscache_stat(&fscache_n_cop_drop_object);
+ cache->ops->drop_object(object);
+ fscache_stat_d(&fscache_n_cop_drop_object);
- _enter(",OBJ%x", object->debug_id);
+ /* The parent object wants to know when all it dependents have gone */
+ if (parent) {
+ _debug("release parent OBJ%x {%d}",
+ parent->debug_id, parent->n_children);
- spin_lock(&object->lock);
- if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
- object->state = FSCACHE_OBJECT_WITHDRAWING;
- enqueue = true;
+ spin_lock(&parent->lock);
+ parent->n_children--;
+ if (parent->n_children == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+ object->parent = NULL;
}
- spin_unlock(&object->lock);
- if (enqueue)
- fscache_enqueue_object(object);
+ /* this just shifts the object release to the work processor */
+ fscache_put_object(object);
+ fscache_stat(&fscache_n_object_dead);
_leave("");
+ return transit_to(OBJECT_DEAD);
}
/*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
}
/*
- * discard a ref on a work item
+ * Discard a ref on an object
*/
static void fscache_put_object(struct fscache_object *object)
{
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
fscache_stat_d(&fscache_n_cop_put_object);
}
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+ fscache_objlist_remove(object);
+
+ /* We can get rid of the cookie now */
+ fscache_cookie_put(object->cookie);
+ object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
/*
* enqueue an object for metadata-type processing
*/
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
/**
* fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
*
* Allow an object handler to sleep until the object workqueue is congested.
*
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
/*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately. We return true if the list was
+ * cleared.
*/
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
{
struct fscache_object *dep;
+ bool ret = true;
_enter("{OBJ%x}", object->debug_id);
if (list_empty(&object->dependents))
- return;
+ return true;
spin_lock(&object->lock);
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
struct fscache_object, dep_link);
list_del_init(&dep->dep_link);
-
- /* sort onto appropriate lists */
- fscache_enqueue_object(dep);
+ fscache_raise_event(dep, event);
fscache_put_object(dep);
- if (!list_empty(&object->dependents))
- cond_resched_lock(&object->lock);
+ if (!list_empty(&object->dependents) && need_resched()) {
+ ret = false;
+ break;
+ }
}
spin_unlock(&object->lock);
+ return ret;
}
/*
* remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
*/
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
*
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
/*
* Asynchronously invalidate an object.
*/
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+ int event)
{
struct fscache_operation *op;
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ /* We're going to need the cookie. If the cookie is not available then
+ * retire the object instead.
+ */
+ if (!fscache_use_cookie(object)) {
+ ASSERT(object->cookie->stores.rnode == NULL);
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ _leave(" [no cookie]");
+ return transit_to(KILL_OBJECT);
+ }
/* Reject any new read/write ops and abort any that are pending. */
fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* Now we have to wait for in-progress reads and writes */
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
- _leave(" [ENOMEM]");
- return;
- }
+ if (!op)
+ goto nomem;
fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
- op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+ op->flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_EXCLUSIVE) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* We can allow read and write requests to come in once again. They'll
* queue up behind our exclusive invalidation operation.
*/
- fscache_invalidation_complete(cookie);
- _leave("");
- return;
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ _leave(" [ok]");
+ return transit_to(UPDATE_OBJECT);
+
+nomem:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_unuse_cookie(object);
+ _leave(" [ENOMEM]");
+ return transit_to(KILL_OBJECT);
submit_op_failed:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
kfree(op);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
_leave(" [EIO]");
+ return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+ int event)
+{
+ const struct fscache_state *s;
+
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ s = _fscache_invalidate_object(object, event);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+
+ _leave("");
+ return transit_to(WAIT_FOR_CMD);
}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
- ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+ ASSERT(fscache_object_is_available(op->object));
ASSERTCMP(atomic_read(&op->usage), >, 0);
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
/* need to issue a new write op after this */
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
*/
static void fscache_report_unexpected_submission(struct fscache_object *object,
struct fscache_operation *op,
- unsigned long ostate)
+ const struct fscache_state *ostate)
{
static bool once_only;
struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
once_only = true;
kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id,
- fscache_object_states[object->state]);
- kdebug("objstate=%s [%s]",
- fscache_object_states[object->state],
- fscache_object_states[ostate]);
+ op->debug_id, object->debug_id, object->state->name);
+ kdebug("objstate=%s [%s]", object->state->name, ostate->name);
kdebug("objflags=%lx", object->flags);
kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
int fscache_submit_op(struct fscache_object *object,
struct fscache_operation *op)
{
- unsigned long ostate;
+ const struct fscache_state *ostate;
int ret;
_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
fscache_run_op(object, op);
}
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_DYING ||
- object->state == FSCACHE_OBJECT_LC_DYING ||
- object->state == FSCACHE_OBJECT_WITHDRAWING) {
+ } else if (fscache_object_is_dying(object)) {
fscache_stat(&fscache_n_op_rejected);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
}
/*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object. The caller must hold
+ * object->lock.
*/
void fscache_start_operations(struct fscache_object *object)
{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
- if (atomic_dec_and_test(&object->n_reads)) {
- clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
- &object->cookie->flags);
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_WAITING_ON_READS);
- }
- }
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+ fscache_unuse_cookie(object);
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT)) {
+ if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object)) {
+ if (fscache_object_is_active(object) &&
+ fscache_use_cookie(object)) {
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
+ fscache_unuse_cookie(object);
if (ret < 0)
fscache_abort_object(object);
}
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
- ASSERTCMP(op->n_pages, ==, 0);
+ ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
* allocate a retrieval op
*/
static struct fscache_retrieval *fscache_alloc_retrieval(
+ struct fscache_cookie *cookie,
struct address_space *mapping,
fscache_rw_complete_t end_io_func,
void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
}
fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
- op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+ atomic_inc(&cookie->n_active);
+ op->op.flags = FSCACHE_OP_MYTHREAD |
+ (1UL << FSCACHE_OP_WAITING) |
+ (1UL << FSCACHE_OP_UNUSE_COOKIE);
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
struct fscache_retrieval *op =
container_of(_op, struct fscache_retrieval, op);
- op->n_pages = 0;
+ atomic_set(&op->n_pages, 0);
}
/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, page->mapping,
+ end_io_func,context);
if (!op) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+ ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- op->n_pages = *nr_pages;
+ atomic_set(&op->n_pages, *nr_pages);
spin_lock(&cookie->lock);
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+ op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -675,6 +684,7 @@ error:
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
*/
spin_unlock(&object->lock);
fscache_op_complete(&op->op, false);
- _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
- _op->flags, _op->state, object->state, object->flags);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+ _op->flags, _op->state, object->state->short_name,
+ object->flags);
return;
}
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
_enter("");
- while (spin_lock(&cookie->stores_lock),
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
- ARRAY_SIZE(results),
- FSCACHE_COOKIE_PENDING_TAG),
- n > 0) {
+ for (;;) {
+ spin_lock(&cookie->stores_lock);
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG);
+ if (n == 0) {
+ spin_unlock(&cookie->stores_lock);
+ break;
+ }
+
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
page_cache_release(results[i]);
}
- spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
* (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
* set)
*
- * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- * fill op)
+ * (a) no writes yet
*
* (b) writes deferred till post-creation (mark page for writing and
* return immediately)
*
* (2) negative lookup, object created, initial fill being made from netfs
- * (FSCACHE_COOKIE_INITIAL_FILL is set)
*
* (a) fill point not yet reached this page (mark page for writing and
* return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
- op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+ op->op.flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_WAITING) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
op->store_limit = object->store_limit;
+ atomic_inc(&cookie->n_active);
if (fscache_submit_op(object, &op->op) < 0)
goto submit_failed;
@@ -945,6 +961,7 @@ already_pending:
return 0;
submit_failed:
+ atomic_dec(&cookie->n_active);
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0eda52738ec4..72a5d5b04494 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1223,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file,
if (name.name[1] == '.' && name.len == 2)
return 0;
}
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
fc = get_fuse_conn(dir);
name.hash = full_name_hash(name.name, name.len);
dentry = d_lookup(parent, &name);
- if (dentry && dentry->d_inode) {
+ if (dentry) {
inode = dentry->d_inode;
- if (get_node_id(inode) == o->nodeid) {
+ if (!inode) {
+ d_drop(dentry);
+ } else if (get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ err = d_invalidate(dentry);
+ if (err)
+ goto out;
+ } else if (is_bad_inode(inode)) {
+ err = -EIO;
+ goto out;
+ } else {
struct fuse_inode *fi;
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+
/*
* The other branch to 'found' comes via fuse_iget()
* which bumps nlookup inside
*/
goto found;
}
- err = d_invalidate(dentry);
- if (err)
- goto out;
dput(dentry);
- dentry = NULL;
}
dentry = d_alloc(parent, &name);
@@ -1259,25 +1275,30 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- alias = d_materialise_unique(dentry, inode);
- err = PTR_ERR(alias);
- if (IS_ERR(alias))
- goto out;
+ if (S_ISDIR(inode->i_mode)) {
+ mutex_lock(&fc->inst_mutex);
+ alias = fuse_d_add_directory(dentry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias)) {
+ iput(inode);
+ goto out;
+ }
+ } else {
+ alias = d_splice_alias(inode, dentry);
+ }
+
if (alias) {
dput(dentry);
dentry = alias;
}
found:
- fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
- attr_version);
-
fuse_change_entry_timeout(dentry, o);
err = 0;
out:
- if (dentry)
- dput(dentry);
+ dput(dentry);
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bfb20a8642c3..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2469,13 +2469,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE);
if (fc->no_fallocate)
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (lock_inode) {
mutex_lock(&inode->i_mutex);
- fuse_set_nowrite(inode);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_set_nowrite(inode);
}
req = fuse_get_req_nopages(fc);
@@ -2510,8 +2513,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
fuse_invalidate_attr(inode);
out:
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- fuse_release_nowrite(inode);
+ if (lock_inode) {
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_release_nowrite(inode);
mutex_unlock(&inode->i_mutex);
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
static void sanitize_global_limit(unsigned *limit)
{
if (*limit == 0)
- *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
sizeof(struct fuse_req);
if (*limit >= 1 << 16)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5a376ab81feb..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
be found here: http://sources.redhat.com/cluster
The "nolock" lock module is now built in to GFS2 by default. If
- you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
- networking.
+ you want to use the DLM, be sure to enable IPv4/6 networking.
config GFS2_FS_LOCKING_DLM
bool "GFS2 DLM locking"
depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
- HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
+ CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
help
Multiple node locking module for GFS2
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
goto out;
}
return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
/* Is the page fully outside i_size? (truncate in progress) */
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0,
+ PAGE_CACHE_SIZE);
unlock_page(page);
continue;
}
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
goto out;
bh = head = page_buffers(page);
do {
+ if (pos + bh->b_size > stop)
+ return;
+
if (offset <= pos)
gfs2_discard(sdp, bh);
pos += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
out:
- if (offset == 0)
+ if (!partial_page)
try_to_release_page(page, 0);
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93b5809c20bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
unstuff = 1;
}
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+ 0 : RES_QUOTA), 0);
if (error)
goto do_grow_release;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index e0449c10286a..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1125,13 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
if (IS_ERR(hc))
return PTR_ERR(hc);
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+ hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
if (hc2 == NULL)
hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
if (!hc2)
return -ENOMEM;
+ h = hc2;
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
goto out_kfree;
@@ -1547,9 +1548,9 @@ out:
/**
* gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
*
* This routine searches a directory for a file or another directory.
* Assumes a glock is held on dip.
@@ -1557,22 +1558,25 @@ out:
* Returns: errno
*/
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+ bool fail_on_exist)
{
struct buffer_head *bh;
struct gfs2_dirent *dent;
- struct inode *inode;
+ u64 addr, formal_ino;
+ u16 dtype;
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
if (IS_ERR(dent))
return ERR_CAST(dent);
- inode = gfs2_inode_lookup(dir->i_sb,
- be16_to_cpu(dent->de_type),
- be64_to_cpu(dent->de_inum.no_addr),
- be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+ dtype = be16_to_cpu(dent->de_type);
+ addr = be64_to_cpu(dent->de_inum.no_addr);
+ formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
- return inode;
+ if (fail_on_exist)
+ return ERR_PTR(-EEXIST);
+ return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
}
return ERR_PTR(-ENOENT);
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ba9000bc1397..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,7 +18,8 @@ struct gfs2_inode;
struct gfs2_inum;
extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename);
+ const struct qstr *filename,
+ bool fail_on_exist);
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cebfd404c1d9..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -531,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
/**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
*
- * Returns: errno
+ * This maybe called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
*/
-static int gfs2_open(struct inode *inode, struct file *file)
+int gfs2_open_common(struct inode *inode, struct file *file)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder i_gh;
struct gfs2_file *fp;
- int error;
+ int ret;
- fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+ if (S_ISREG(inode->i_mode)) {
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+ }
+
+ fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
if (!fp)
return -ENOMEM;
@@ -553,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+ bool need_unlock = false;
if (S_ISREG(ip->i_inode.i_mode)) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (error)
- goto fail;
+ return error;
+ need_unlock = true;
+ }
- if (!(file->f_flags & O_LARGEFILE) &&
- i_size_read(inode) > MAX_NON_LFS) {
- error = -EOVERFLOW;
- goto fail_gunlock;
- }
+ error = gfs2_open_common(inode, file);
+ if (need_unlock)
gfs2_glock_dq_uninit(&i_gh);
- }
-
- return 0;
-fail_gunlock:
- gfs2_glock_dq_uninit(&i_gh);
-fail:
- file->private_data = NULL;
- kfree(fp);
return error;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..e2e0a90396e7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -47,32 +47,28 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
* None of the buffers should be dirty, locked, or pinned.
*/
-static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
+ unsigned int nr_revokes)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct list_head *head = &gl->gl_ail_list;
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
- sector_t blocknr;
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
+ list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) {
+ if (nr_revokes == 0)
+ break;
bh = bd->bd_bh;
if (bh->b_state & b_state) {
if (fsync)
continue;
gfs2_ail_error(gl, bh);
}
- blocknr = bh->b_blocknr;
- bh->b_private = NULL;
- gfs2_remove_from_ail(bd); /* drops ref on bh */
-
- bd->bd_bh = NULL;
- bd->bd_blkno = blocknr;
-
gfs2_trans_add_revoke(sdp, bd);
+ nr_revokes--;
}
GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
spin_unlock(&sdp->sd_ail_lock);
@@ -99,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
- __gfs2_ail_flush(gl, 0);
+ __gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
@@ -109,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
+ unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
int ret;
if (!revokes)
return;
- ret = gfs2_trans_begin(sdp, 0, revokes);
+ while (revokes > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+
+ ret = gfs2_trans_begin(sdp, 0, max_revokes);
if (ret)
return;
- __gfs2_ail_flush(gl, fsync);
+ __gfs2_ail_flush(gl, fsync, max_revokes);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e4a9e4..a01b8fd3a1c1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -313,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
goto out;
}
- inode = gfs2_dir_search(dir, name);
+ inode = gfs2_dir_search(dir, name, false);
if (IS_ERR(inode))
error = PTR_ERR(inode);
out:
@@ -346,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
if (!dip->i_inode.i_nlink)
return -ENOENT;
- error = gfs2_dir_check(&dip->i_inode, name, NULL);
- switch (error) {
- case -ENOENT:
- error = 0;
- break;
- case 0:
- return -EEXIST;
- default:
- return error;
- }
-
if (dip->i_entries == (u32)-1)
return -EFBIG;
if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -546,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
* @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
* @mode: The permissions on the new inode
* @dev: For device nodes, this is the device number
* @symname: For symlinks, this is the link destination
@@ -555,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
*/
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+ struct file *file,
umode_t mode, dev_t dev, const char *symname,
- unsigned int size, int excl)
+ unsigned int size, int excl, int *opened)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -564,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
+ struct dentry *d;
int error;
u32 aflags = 0;
int arq;
@@ -584,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail;
error = create_ok(dip, name, mode);
- if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- gfs2_glock_dq_uninit(ghs);
- d_instantiate(dentry, inode);
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
- }
if (error)
goto fail_gunlock;
+ inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+ error = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ d = d_splice_alias(inode, dentry);
+ error = 0;
+ if (file && !IS_ERR(d)) {
+ if (d == NULL)
+ d = dentry;
+ if (S_ISREG(inode->i_mode))
+ error = finish_open(file, d, gfs2_open_common, opened);
+ else
+ error = finish_no_open(file, d);
+ }
+ gfs2_glock_dq_uninit(ghs);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ return error;
+ } else if (error != -ENOENT) {
+ goto fail_gunlock;
+ }
+
arq = error = gfs2_diradd_alloc_required(dir, name);
if (error < 0)
goto fail_gunlock;
@@ -686,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_gunlock3;
mark_inode_dirty(inode);
+ d_instantiate(dentry, inode);
+ if (file)
+ error = finish_open(file, dentry, gfs2_open_common, opened);
gfs2_glock_dq_uninit(ghs);
gfs2_glock_dq_uninit(ghs + 1);
- d_instantiate(dentry, inode);
- return 0;
+ return error;
fail_gunlock3:
gfs2_glock_dq_uninit(ghs + 1);
@@ -729,36 +738,56 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
}
/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
* @dir: The directory inode
* @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
*
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
*
* Returns: errno
*/
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct file *file, int *opened)
{
- struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode && !IS_ERR(inode)) {
- struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
- struct gfs2_holder gh;
- int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error) {
- iput(inode);
- return ERR_PTR(error);
- }
- gfs2_glock_dq_uninit(&gh);
+ struct inode *inode;
+ struct dentry *d;
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl;
+ int error;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (!inode)
+ return NULL;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ gl = GFS2_I(inode)->i_gl;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
}
- return d_splice_alias(inode, dentry);
+
+ d = d_splice_alias(inode, dentry);
+ if (file && S_ISREG(inode->i_mode))
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+
+ gfs2_glock_dq_uninit(&gh);
+ if (error)
+ return ERR_PTR(error);
+ return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned flags)
+{
+ return __gfs2_lookup(dir, dentry, NULL, NULL);
}
/**
@@ -1076,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
}
/**
@@ -1092,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct gfs2_sbd *sdp = GFS2_SB(dir);
unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
}
/**
@@ -1107,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+ return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ struct dentry *d;
+ bool excl = !!(flags & O_EXCL);
+
+ d = __gfs2_lookup(dir, dentry, file, opened);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ if (d == NULL)
+ d = dentry;
+ if (d->d_inode) {
+ if (!(*opened & FILE_OPENED))
+ return finish_no_open(file, d);
+ return 0;
+ }
+
+ if (!(flags & O_CREAT))
+ return -ENOENT;
+
+ return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
}
/*
@@ -1787,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .atomic_open = gfs2_atomic_open,
};
const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr, *s;
+ int oldest_tr = 1;
int ret;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
gfs2_ail1_empty_one(sdp, tr);
- if (list_empty(&tr->tr_ail1_list))
+ if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
- break;
+ oldest_tr = 0;
}
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
- unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+ unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
spin_unlock(&sdp->sd_ordered_lock);
}
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+ struct buffer_head *bh = bd->bd_bh;
+ struct gfs2_glock *gl = bd->bd_gl;
+
+ gfs2_remove_from_ail(bd);
+ bd->bd_bh = NULL;
+ bh->b_private = NULL;
+ bd->bd_blkno = bh->b_blocknr;
+ bd->bd_ops = &gfs2_revoke_lops;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd, *tmp;
+ int have_revokes = 0;
+ int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+ gfs2_ail1_empty(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (list_empty(&bd->bd_list)) {
+ have_revokes = 1;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock(&sdp->sd_ail_lock);
+ if (have_revokes == 0)
+ return;
+ while (sdp->sd_log_num_revoke > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ max_revokes -= sdp->sd_log_num_revoke;
+ if (!sdp->sd_log_num_revoke) {
+ atomic_dec(&sdp->sd_log_blks_free);
+ /* If no blocks have been reserved, we need to also
+ * reserve a block for the header */
+ if (!sdp->sd_log_blks_reserved)
+ atomic_dec(&sdp->sd_log_blks_free);
+ }
+ gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (max_revokes == 0)
+ goto out_of_blocks;
+ if (!list_empty(&bd->bd_list))
+ continue;
+ gfs2_add_revoke(sdp, bd);
+ max_revokes--;
+ }
+ }
+out_of_blocks:
+ spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
+
+ if (!sdp->sd_log_num_revoke) {
+ atomic_inc(&sdp->sd_log_blks_free);
+ if (!sdp->sd_log_blks_reserved)
+ atomic_inc(&sdp->sd_log_blks_free);
+ }
+}
+
/**
* log_write_header - Get and initialize a journal header buffer
* @sdp: The GFS2 superblock
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
lh = page_address(page);
clear_page(lh);
- gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c33d7b6e0c4..010b9fb9fec6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
kunmap_atomic(kaddr);
}
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_bufdata *bda, *bdb;
+
+ bda = list_entry(a, struct gfs2_bufdata, bd_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+ if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ return -1;
+ if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ return 1;
+ return 0;
+}
+
static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
unsigned int total, struct list_head *blist,
bool is_databuf)
@@ -413,6 +428,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
__be64 *ptr;
gfs2_log_lock(sdp);
+ list_sort(NULL, blist, blocknr_cmp);
bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
while(total) {
num = total;
@@ -563,6 +579,24 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
return error;
}
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+static void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+ struct address_space *mapping = gfs2_glock2aspace(gl);
+ int error;
+
+ filemap_fdatawrite(mapping);
+ error = filemap_fdatawait(mapping);
+
+ if (error)
+ gfs2_io_error(gl->gl_sbd);
+}
+
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -590,6 +624,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
struct page *page;
unsigned int length;
+ gfs2_write_revokes(sdp);
if (!sdp->sd_log_num_revoke)
return;
@@ -836,10 +871,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
.lo_name = "revoke",
};
-const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_name = "rg",
-};
-
const struct gfs2_log_operations gfs2_databuf_lops = {
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
@@ -851,7 +882,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
const struct gfs2_log_operations *gfs2_log_ops[] = {
&gfs2_databuf_lops,
&gfs2_buf_lops,
- &gfs2_rg_lops,
&gfs2_revoke_lops,
NULL,
};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
extern const struct gfs2_log_operations gfs2_glock_lops;
extern const struct gfs2_log_operations gfs2_buf_lops;
extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index e04d0e09ee7b..7b0f5043cf24 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void)
goto fail_wq;
gfs2_control_wq = alloc_workqueue("gfs2_control",
- WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
+ WQ_UNBOUND | WQ_FREEZABLE, 0);
if (!gfs2_control_wq)
goto fail_recovery;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..932415050540 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -98,24 +98,6 @@ const struct address_space_operations gfs2_meta_aops = {
};
/**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
- struct address_space *mapping = gfs2_glock2aspace(gl);
- int error;
-
- filemap_fdatawrite(mapping);
- error = filemap_fdatawait(mapping);
-
- if (error)
- gfs2_io_error(gl->gl_sbd);
-}
-
-/**
* gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
@@ -296,10 +278,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
if (bd) {
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_tr) {
- gfs2_remove_from_ail(bd);
- bh->b_private = NULL;
- bd->bd_bh = NULL;
- bd->bd_blkno = bh->b_blocknr;
gfs2_trans_add_revoke(sdp, bd);
}
spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 0d4c843b6f8e..4823b934208a 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -48,21 +48,17 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
return inode->i_sb->s_fs_info;
}
-void gfs2_meta_sync(struct gfs2_glock *gl);
-
-struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
- int flags, struct buffer_head **bhp);
-int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
-
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
- int meta);
-
-void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-
-int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
- struct buffer_head **bhp);
+extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ struct buffer_head **bhp);
+extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+ int create);
+extern void gfs2_remove_from_journal(struct buffer_head *bh,
+ struct gfs2_trans *tr, int meta);
+extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
goto fail_quotad;
p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start logd thread: %d\n", error);
return error;
}
sdp->sd_logd_process = p;
p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start quotad thread: %d\n", error);
goto fail;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c253b13722e8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
- return gfs2_quota_sync(sb, type);
-}
-
int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9809156e3d04..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1288,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
minlen = max_t(u64, r.minlen,
q->limits.discard_granularity) >> bs_shift;
+ if (end <= start || minlen > sdp->sd_max_rg_data)
+ return -EINVAL;
+
rgd = gfs2_blk2rgrpd(sdp, start, 0);
- rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
- if (end <= start ||
- minlen > sdp->sd_max_rg_data ||
- start > rgd_end->rd_data0 + rgd_end->rd_data)
- return -EINVAL;
+ if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+ && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+ return -EINVAL; /* start is beyond the end of the fs */
while (1) {
@@ -1336,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
}
out:
- r.len = trimmed << 9;
+ r.len = trimmed << bs_shift;
if (copy_to_user(argp, &r, sizeof(r)))
return -EFAULT;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr = current->journal_info;
BUG_ON(!list_empty(&bd->bd_list));
- BUG_ON(!list_empty(&bd->bd_ail_st_list));
- BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- bd->bd_ops = &gfs2_revoke_lops;
+ gfs2_add_revoke(sdp, bd);
tr->tr_touched = 1;
tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2e..4d0a1afa058c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
*/
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include "hpfs_fn.h"
+void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
+{
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+ return;
+
+ bh = sb_find_get_block(s, secno);
+ if (bh) {
+ if (buffer_uptodate(bh)) {
+ brelse(bh);
+ return;
+ }
+ brelse(bh);
+ };
+
+ blk_start_plug(&plug);
+ while (n > 0) {
+ if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+ break;
+ sb_breadahead(s, secno);
+ secno++;
+ n--;
+ }
+ blk_finish_plug(&plug);
+}
+
/* Map a sector into a buffer and return pointers to it and to the buffer. */
void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
hpfs_lock_assert(s);
+ hpfs_prefetch_sectors(s, secno, ahead);
+
cond_resched();
*bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
return NULL;
}
+ hpfs_prefetch_sectors(s, secno, 4 + ahead);
+
qbh->data = data = kmalloc(2048, GFP_NOFS);
if (!data) {
printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index e4ba5fe4c3b5..4e9dabcf1f4c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
*/
#include "hpfs_fn.h"
+#include <linux/mpage.h>
#define BLOCKS(size) (((size) + 511) >> 9)
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
* so we must ignore such errors.
*/
-static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
+static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
{
struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
struct buffer_head *bh;
if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
n = file_secno - hpfs_inode->i_file_sec;
- if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n;
+ if (n < hpfs_inode->i_n_secs) {
+ *n_secs = hpfs_inode->i_n_secs - n;
+ return hpfs_inode->i_disk_sec + n;
+ }
if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
if (disk_secno == -1) return 0;
if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
+ n = file_secno - hpfs_inode->i_file_sec;
+ if (n < hpfs_inode->i_n_secs) {
+ *n_secs = hpfs_inode->i_n_secs - n;
+ return hpfs_inode->i_disk_sec + n;
+ }
+ *n_secs = 1;
return disk_secno;
}
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
{
int r;
secno s;
+ unsigned n_secs;
hpfs_lock(inode->i_sb);
- s = hpfs_bmap(inode, iblock);
+ s = hpfs_bmap(inode, iblock, &n_secs);
if (s) {
+ if (bh_result->b_size >> 9 < n_secs)
+ n_secs = bh_result->b_size >> 9;
map_bh(bh_result, inode->i_sb, s);
+ bh_result->b_size = n_secs << 9;
goto ret_0;
}
if (!create) goto ret_0;
@@ -95,14 +109,26 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
return r;
}
+static int hpfs_readpage(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, hpfs_get_block);
+}
+
static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
{
- return block_write_full_page(page,hpfs_get_block, wbc);
+ return block_write_full_page(page, hpfs_get_block, wbc);
}
-static int hpfs_readpage(struct file *file, struct page *page)
+static int hpfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
+}
+
+static int hpfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_read_full_page(page,hpfs_get_block);
+ return mpage_writepages(mapping, wbc, hpfs_get_block);
}
static void hpfs_write_failed(struct address_space *mapping, loff_t to)
@@ -161,6 +187,8 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations hpfs_aops = {
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
+ .readpages = hpfs_readpages,
+ .writepages = hpfs_writepages,
.write_begin = hpfs_write_begin,
.write_end = hpfs_write_end,
.bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b5..1b398636e990 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
#define ALLOC_FWD_MAX 128
#define ALLOC_M 1
#define FNODE_RD_AHEAD 16
-#define ANODE_RD_AHEAD 16
-#define DNODE_RD_AHEAD 4
+#define ANODE_RD_AHEAD 0
+#define DNODE_RD_AHEAD 72
+#define COUNT_RD_AHEAD 62
#define FREE_DNODES_ADD 58
#define FREE_DNODES_DEL 29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
/* buffer.c */
+void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+void hpfs_prefetch_bitmap(struct super_block *, unsigned);
unsigned char *hpfs_load_code_page(struct super_block *, secno);
__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d78359..3aa66ae1031e 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
struct quad_buffer_head *qbh, char *id)
{
secno sec;
- if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) {
+ __le32 *ret;
+ unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+ if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
return NULL;
}
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
return NULL;
}
- return hpfs_map_4sectors(s, sec, qbh, 4);
+ ret = hpfs_map_4sectors(s, sec, qbh, 4);
+ if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
+ return ret;
+}
+
+void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
+{
+ unsigned to_prefetch, next_prefetch;
+ unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+ if (unlikely(bmp_block >= n_bands))
+ return;
+ to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
+ if (unlikely(bmp_block + 1 >= n_bands))
+ next_prefetch = 0;
+ else
+ next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
+ hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
}
/*
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e706957..4334cda8dba1 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
unsigned long *bits;
unsigned count;
- bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+ bits = hpfs_map_4sectors(s, secno, &qbh, 0);
if (!bits)
return 0;
count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
unsigned n, count, n_bands;
n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
count = 0;
- for (n = 0; n < n_bands; n++)
+ for (n = 0; n < COUNT_RD_AHEAD; n++) {
+ hpfs_prefetch_bitmap(s, n);
+ }
+ for (n = 0; n < n_bands; n++) {
+ hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ }
return count;
}
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
sbi->sb_max_fwd_alloc = 0xffffff;
-
+
+ if (sbi->sb_fs_size >= 0x80000000) {
+ hpfs_error(s, "invalid size in superblock: %08x",
+ (unsigned)sbi->sb_fs_size);
+ goto bail4;
+ }
+
/* Load bitmap directory */
if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
goto bail4;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index fc90ab11c340..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
struct dentry *parent;
char *root, *name;
const char *seg_name;
- int len, seg_len;
+ int len, seg_len, root_len;
len = 0;
parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
}
root = "proc";
- len += strlen(root);
+ root_len = strlen(root);
+ len += root_len;
name = kmalloc(len + extra + 1, GFP_KERNEL);
if (name == NULL)
return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
while (parent->d_parent != parent) {
if (is_pid(parent)) {
seg_name = "pid";
- seg_len = strlen("pid");
+ seg_len = strlen(seg_name);
}
else {
seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
len -= seg_len + 1;
name[len] = '/';
- strncpy(&name[len + 1], seg_name, seg_len);
+ memcpy(&name[len + 1], seg_name, seg_len);
parent = parent->d_parent;
}
- strncpy(name, root, strlen(root));
+ memcpy(name, root, root_len);
return name;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index c348d6d88624..e5d408a7ea4a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -117,8 +117,8 @@ static void destroy_inodecache(void)
static int isofs_remount(struct super_block *sb, int *flags, char *data)
{
- /* we probably want a lot more here */
- *flags |= MS_RDONLY;
+ if (!(*flags & MS_RDONLY))
+ return -EROFS;
return 0;
}
@@ -763,15 +763,6 @@ root_found:
*/
s->s_maxbytes = 0x80000000000LL;
- /*
- * The CDROM is read-only, has no nodes (devices) on it, and since
- * all of the files appear to be owned by root, we really do not want
- * to allow suid. (suid or devices will not show up unless we have
- * Rock Ridge extensions)
- */
-
- s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
-
/* Set this for reference. Its not currently used except on write
which we don't have .. */
@@ -1530,6 +1521,9 @@ struct inode *isofs_iget(struct super_block *sb,
static struct dentry *isofs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
+ /* We don't support read-write mounts */
+ if (!(flags & MS_RDONLY))
+ return ERR_PTR(-EACCES);
return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
* void journal_invalidatepage() - invalidate a journal page
* @journal: journal to use for flush
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: offset of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh,
- offset > 0);
+ partial_page);
unlock_buffer(bh);
}
curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
+ depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+ "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd_space_needed(journal);
- while (__jbd2_log_space_left(journal) < nblocks) {
+ nblocks = jbd2_space_needed(journal);
+ while (jbd2_log_space_left(journal) < nblocks) {
if (journal->j_flags & JBD2_ABORT)
return;
write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
*/
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __jbd2_log_space_left(journal);
+ nblocks = jbd2_space_needed(journal);
+ space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/* We were able to recover space; yay! */
;
} else if (tid) {
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set. So we need to temporarily drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
+ write_lock(&journal->j_state_lock);
+ continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
ret = 1;
out:
return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
#include <trace/events/jbd2.h>
/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ struct buffer_head *orig_bh = bh->b_private;
+
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
+ if (orig_bh) {
+ clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&orig_bh->b_state, BH_Shadow);
+ }
unlock_buffer(bh);
}
@@ -85,8 +92,7 @@ nope:
__brelse(bh);
}
-static void jbd2_commit_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
struct commit_header *h;
__u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+ h = (struct commit_header *)(bh->b_data);
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head **cbh,
__u32 crc32_sum)
{
- struct journal_head *descriptor;
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
- if (!descriptor)
+ bh = jbd2_journal_get_descriptor_buffer(journal);
+ if (!bh)
return 1;
- bh = jh2bh(descriptor);
-
tmp = (struct commit_header *)bh->b_data;
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- jbd2_commit_block_csum_set(journal, descriptor);
+ jbd2_commit_block_csum_set(journal, bh);
- JBUFFER_TRACE(descriptor, "submit commit block");
+ BUFFER_TRACE(bh, "submit commit block");
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
put_bh(bh); /* One for getblk() */
- jbd2_journal_put_journal_head(bh2jh(bh));
return ret;
}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
}
static void jbd2_descr_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+ struct buffer_head *bh)
{
struct jbd2_journal_block_tail *tail;
__u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_block_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
{
struct page *page = bh->b_page;
__u8 *addr;
- __u32 csum;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
sequence = cpu_to_be32(sequence);
addr = kmap_atomic(page);
- csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
- bh->b_size);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+ bh->b_size);
kunmap_atomic(addr);
- tag->t_checksum = cpu_to_be32(csum);
+ /* We only have space to store the lower 16 bits of the crc32c. */
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
{
struct transaction_stats_s stats;
transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
+ struct journal_head *jh;
+ struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tid_t first_tid;
int update_tail;
int csum_size = 0;
+ LIST_HEAD(io_bufs);
+ LIST_HEAD(log_bufs);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ /*
+ * Reserved credits cannot be claimed anymore, free them
+ */
+ atomic_sub(atomic_read(&journal->j_reserved_credits),
+ &commit_transaction->t_outstanding_credits);
+
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
- WRITE_SYNC);
+ &log_bufs, WRITE_SYNC);
blk_finish_plug(&plug);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_read(&commit_transaction->t_outstanding_credits));
err = 0;
- descriptor = NULL;
bufs = 0;
+ descriptor = NULL;
blk_start_plug(&plug);
while (commit_transaction->t_buffers) {
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- bh = jh2bh(descriptor);
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
+ (unsigned long long)descriptor->b_blocknr,
+ descriptor->b_data);
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
+ tagp = &descriptor->b_data[sizeof(journal_header_t)];
+ space_left = descriptor->b_size -
+ sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
+ set_buffer_jwrite(descriptor);
+ set_buffer_dirty(descriptor);
+ wbuf[bufs++] = descriptor;
/* Record it so that we can wait for IO
completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- jbd2_journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+ jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ rid of the shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
- * akpm: jbd2_journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
+ * Make a temporary IO buffer with which to write it out
+ * (this will requeue the metadata buffer to BJ_Shadow).
*/
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
+ jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
continue;
}
- set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
- wbuf[bufs++] = jh2bh(new_jh);
+ jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
- jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+ jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
+ bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
+ the io_bufs list.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
jbd_debug(3, "JBD2: commit phase 3\n");
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
+ while (!list_empty(&io_bufs)) {
+ struct buffer_head *bh = list_entry(io_bufs.prev,
+ struct buffer_head,
+ b_assoc_buffers);
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_unfile_log_bh(bh);
/*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by jbd2_journal_write_metadata_buffer().
+ * The list contains temporary buffer heads created by
+ * jbd2_journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
- jbd2_journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
- /* We also have to unlock and free the corresponding
- shadowed buffer */
+ /* We also have to refile the corresponding shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
- clear_bit(BH_JWrite, &bh->b_state);
+ clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
+ J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this IO to
- * complete. The barrier must be here so that changes by
- * jbd2_journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
@@ -883,26 +862,19 @@ wait_for_iobuf:
jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
+ while (!list_empty(&log_bufs)) {
struct buffer_head *bh;
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
+ bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
- jbd2_journal_unfile_buffer(journal, jh);
- jbd2_journal_put_journal_head(jh);
+ jbd2_unfile_log_bh(bh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -952,9 +924,7 @@ wait_for_iobuf:
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > jbd2_journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
/* Checksumming functions */
int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
*
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ *
*
* Return value:
* <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned long long blocknr)
+ struct buffer_head **bh_out,
+ sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+ jbd_lock_bh_state(bh_in);
+repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
- jbd_lock_bh_state(bh_in);
-repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
- jbd2_journal_put_journal_head(new_jh);
+ brelse(new_bh);
return -ENOMEM;
}
jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
jh_in->b_frozen_data = tmp;
mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+ memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
}
set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_size = bh_in->b_size;
+ new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
+ new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
- *jh_out = new_jh;
+ *bh_out = new_bh;
/*
* The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
+ set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in);
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
return do_escape | (done_copy_out << 1);
}
@@ -484,35 +496,6 @@ repeat:
*/
/*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- /* assert_spin_locked(&journal->j_state_lock); */
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
}
/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions. We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ * 0 if nothing to commit,
+ * 1 if transaction was successfully committed.
*/
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
- int need_to_start = 0;
+ int need_to_start = 0, ret = 0;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
transaction = journal->j_committing_transaction;
if (!transaction) {
+ /* Nothing to commit */
read_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
+ return 0;
}
-
tid = transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- return 1;
+ ret = jbd2_log_wait_commit(journal, tid);
+ if (!ret)
+ ret = 1;
+
+ return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ int ret;
+
+ ret = __jbd2_journal_force_commit(journal);
+ return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ int ret;
+
+ J_ASSERT(!current->journal_info);
+ ret = __jbd2_journal_force_commit(journal);
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
- return jbd2_journal_add_journal_head(bh);
+ return bh;
}
/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
return NULL;
init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_reserved);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
+ atomic_set(&journal->j_reserved_credits, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
static void jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
+ journal_superblock_t *sb = journal->j_superblock;
int ret;
trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ jbd2_superblock_csum_set(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
journal->j_errno);
sb->s_errno = cpu_to_be32(journal->j_errno);
- jbd2_superblock_csum_set(journal, sb);
read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD2_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
while (!ret) {
yield();
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
- __u32 provided, calculated;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
sequence = cpu_to_be32(sequence);
- calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
- provided = be32_to_cpu(tag->t_checksum);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return provided == cpu_to_be32(calculated);
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
+ struct list_head *,
+ struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
+ struct list_head *log_bufs,
int write_op)
{
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction,
+ write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
- struct journal_head **descriptorp,
+ struct list_head *log_bufs,
+ struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
int csum_size = 0;
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
int offset;
journal_header_t *header;
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "file in log_bufs");
+ jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
- * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
{
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_revoke_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->r_checksum = cpu_to_be32(csum);
}
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
*/
static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
+ struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
- put_bh(bh);
+ put_bh(descriptor);
return;
}
- header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
jbd2_revoke_csum_set(journal, descriptor);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+ write_dirty_buffer(descriptor, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
- atomic_set(&transaction->t_outstanding_credits, 0);
+ atomic_set(&transaction->t_outstanding_credits,
+ atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+ int need_to_start;
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ need_to_start = !tid_geq(journal->j_commit_request, tid);
+ read_unlock(&journal->j_state_lock);
+ if (need_to_start)
+ jbd2_log_start_commit(journal, tid);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+ atomic_sub(blocks, &journal->j_reserved_credits);
+ wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction. Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+ int rsv_blocks)
+{
+ transaction_t *t = journal->j_running_transaction;
+ int needed;
+ int total = blocks + rsv_blocks;
+
+ /*
+ * If the current transaction is locked down for commit, wait
+ * for the lock to be released.
+ */
+ if (t->t_state == T_LOCKED) {
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all
+ * potential buffers requested by this operation, we need to
+ * stall pending a log checkpoint to free some more log space.
+ */
+ needed = atomic_add_return(total, &t->t_outstanding_credits);
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large,
+ * then start to commit it: we can then go back and
+ * attach this handle to a new transaction.
+ */
+ atomic_sub(total, &t->t_outstanding_credits);
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ */
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ __jbd2_log_wait_for_space(journal);
+ write_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* No reservation? We are done... */
+ if (!rsv_blocks)
+ return 0;
+
+ needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+ /* We allow at most half of a transaction to be reserved */
+ if (needed > journal->j_max_transaction_buffers / 2) {
+ sub_reserved_credits(journal, rsv_blocks);
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + rsv_blocks
+ <= journal->j_max_transaction_buffers / 2);
+ return 1;
+ }
+ return 0;
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- tid_t tid;
- int needed, need_to_start;
- int nblocks = handle->h_buffer_credits;
+ int blocks = handle->h_buffer_credits;
+ int rsv_blocks = 0;
unsigned long ts = jiffies;
- if (nblocks > journal->j_max_transaction_buffers) {
+ /*
+ * 1/2 of transaction can be reserved so we can practically handle
+ * only 1/2 of maximum transaction size per operation
+ */
+ if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
+ current->comm, blocks,
+ journal->j_max_transaction_buffers / 2);
return -ENOSPC;
}
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
return -EROFS;
}
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
+ /*
+ * Wait on the journal's transaction barrier if necessary. Specifically
+ * we allow reserved handles to proceed because otherwise commit could
+ * deadlock on page writeback not being able to complete.
+ */
+ if (!handle->h_reserved && journal->j_barrier_count) {
read_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
goto alloc_transaction;
write_lock(&journal->j_state_lock);
if (!journal->j_running_transaction &&
- !journal->j_barrier_count) {
+ (handle->h_reserved || !journal->j_barrier_count)) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -223,85 +340,18 @@ repeat:
transaction = journal->j_running_transaction;
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- read_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- needed = atomic_add_return(nblocks,
- &transaction->t_outstanding_credits);
-
- if (needed > journal->j_max_transaction_buffers) {
+ if (!handle->h_reserved) {
+ /* We may have dropped j_state_lock - restart in that case */
+ if (add_transaction_credits(journal, blocks, rsv_blocks))
+ goto repeat;
+ } else {
/*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
+ * We have handle reserved so we are allowed to join T_LOCKED
+ * transaction and we don't have to check for transaction size
+ * and journal space.
*/
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- tid = transaction->t_tid;
- need_to_start = !tid_geq(journal->j_commit_request, tid);
- read_unlock(&journal->j_state_lock);
- if (need_to_start)
- jbd2_log_start_commit(journal, tid);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * jbd2_journal_extend().
- */
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- read_unlock(&journal->j_state_lock);
- write_lock(&journal->j_state_lock);
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
- __jbd2_log_wait_for_space(journal);
- write_unlock(&journal->j_state_lock);
- goto repeat;
+ sub_reserved_credits(journal, blocks);
+ handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
- handle->h_requested_credits = nblocks;
+ handle->h_requested_credits = blocks;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks,
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
- __jbd2_log_space_left(journal));
+ jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
+ current->journal_info = handle;
lock_map_acquire(&handle->h_lockdep_map);
jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
*
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
- unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+ gfp_t gfp_mask, unsigned int type,
+ unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
+ if (rsv_blocks) {
+ handle_t *rsv_handle;
- current->journal_info = handle;
+ rsv_handle = new_handle(rsv_blocks);
+ if (!rsv_handle) {
+ jbd2_free_handle(handle);
+ return ERR_PTR(-ENOMEM);
+ }
+ rsv_handle->h_reserved = 1;
+ rsv_handle->h_journal = journal;
+ handle->h_rsv_handle = rsv_handle;
+ }
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
- current->journal_info = NULL;
return ERR_PTR(err);
}
handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_buffer_credits);
+ jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * not transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+ unsigned int line_no)
+{
+ journal_t *journal = handle->h_journal;
+ int ret = -EIO;
+
+ if (WARN_ON(!handle->h_reserved)) {
+ /* Someone passed in normal handle? Just stop it. */
+ jbd2_journal_stop(handle);
+ return ret;
+ }
+ /*
+ * Usefulness of mixing of reserved and unreserved handles is
+ * questionable. So far nobody seems to need it so just error out.
+ */
+ if (WARN_ON(current->journal_info)) {
+ jbd2_journal_free_reserved(handle);
+ return ret;
+ }
+
+ handle->h_journal = NULL;
+ /*
+ * GFP_NOFS is here because callers are likely from writeback or
+ * similarly constrained call sites
+ */
+ ret = start_this_handle(journal, handle, GFP_NOFS);
+ if (ret < 0)
+ jbd2_journal_free_reserved(handle);
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
* int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
int result;
int wanted;
- result = -EIO;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
result = 1;
read_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
+ if (transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
- wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+ wanted = atomic_add_return(nblocks,
+ &transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
- if (wanted > __jbd2_log_space_left(journal)) {
+ if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+ jbd2_log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
handle->h_buffer_credits,
nblocks);
handle->h_buffer_credits += nblocks;
handle->h_requested_credits += nblocks;
- atomic_add(nblocks, &transaction->t_outstanding_credits);
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
-out:
return result;
}
@@ -490,19 +615,22 @@ out:
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
* transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
*/
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
tid_t tid;
int need_to_start, ret;
+ WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
+ journal = transaction->t_journal;
/*
* First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
spin_lock(&transaction->t_handle_lock);
atomic_sub(handle->h_buffer_credits,
&transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle) {
+ sub_reserved_credits(journal,
+ handle->h_rsv_handle->h_buffer_credits);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
+ tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
+ handle->h_transaction = NULL;
+ current->journal_info = NULL;
jbd_debug(2, "restarting handle %p\n", handle);
- tid = transaction->t_tid;
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
+ /* Wait until there are no reserved handles */
+ if (atomic_read(&journal->j_reserved_credits)) {
+ write_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) == 0);
+ write_lock(&journal->j_state_lock);
+ }
+
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+static int sleep_on_shadow_bh(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
- transaction_t *transaction;
+ transaction_t *transaction = handle->h_transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
-
- transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+ if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Shadow,
+ sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
goto repeat;
}
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
+ /*
+ * Only do the copy if the currently-owning transaction still
+ * needs it. If buffer isn't on BJ_Metadata list, the
+ * committing transaction is past that stage (here we use the
+ * fact that BH_Shadow is set under bh_state lock together with
+ * refiling to BJ_Shadow list and at this point we know the
+ * buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
+ * in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
+ WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
+ journal = transaction->t_journal;
err = 0;
JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int ret = 0;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
jh = jbd2_journal_grab_journal_head(bh);
if (!jh) {
ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ WARN_ON(!transaction);
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ journal = transaction->t_journal;
+
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
*/
jh->b_modified = 0;
- if (jh->b_transaction == handle->h_transaction) {
+ if (jh->b_transaction == transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err, wait_for_commit = 0;
+ journal_t *journal;
+ int err = 0, wait_for_commit = 0;
tid_t tid;
pid_t pid;
+ if (!transaction)
+ goto free_and_exit;
+ journal = transaction->t_journal;
+
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else {
+ else
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- err = 0;
- }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
lock_map_release(&handle->h_lockdep_map);
+ if (handle->h_rsv_handle)
+ jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
jbd2_free_handle(handle);
return err;
}
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = jbd2_journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = jbd2_journal_stop(handle);
- }
- return ret;
-}
-
/*
*
* List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
- * of these pointers, it could go bad. Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list. If the caller is holding onto a copy of one of these
+ * pointers, it could go bad. Generally the caller needs to re-read the
+ * pointer from the transaction_t.
*
* Called under j_list_lock.
*/
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
* void jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
*/
int jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return 0;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- ret = journal_unmap_buffer(journal, bh, offset > 0);
+ ret = journal_unmap_buffer(journal, bh, partial_page);
unlock_buffer(bh);
if (ret < 0)
return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- return -EIO;
+ return -EROFS;
+ journal = transaction->t_journal;
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ip->i_sb,
- "dbFree: block to be freed is outside the map");
+ jfs_error(ip->i_sb, "block to be freed is outside the map\n");
return -EIO;
}
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
/* free the blocks. */
if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
- jfs_error(ip->i_sb, "dbFree: error in block map\n");
+ jfs_error(ip->i_sb, "error in block map\n");
release_metapage(mp);
IREAD_UNLOCK(ipbmap);
return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
(unsigned long long) blkno,
(unsigned long long) nblocks);
- jfs_error(ipbmap->i_sb,
- "dbUpdatePMap: blocks are outside the map");
+ jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
return -EIO;
}
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
/* the hint should be within the map */
if (hint >= mapSize) {
- jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
+ jfs_error(ip->i_sb, "the hint is outside the map\n");
return -EIO;
}
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
bmp = sbi->bmap;
if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
IREAD_UNLOCK(ipbmap);
- jfs_error(ip->i_sb,
- "dbExtend: the block is outside the filesystem");
+ jfs_error(ip->i_sb, "the block is outside the filesystem\n");
return -EIO;
}
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
u32 mask;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNext: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
s8 *leaf;
if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocNear: Corrupt dmap page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
return -EIO;
}
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
*/
if (l2nb > bmp->db_agl2size) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: allocation request is larger than the "
- "allocation group size");
+ "allocation request is larger than the allocation group size\n");
return -EIO;
}
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
(unsigned long long) blkno,
(unsigned long long) nblocks);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: dbAllocCtl failed in free AG");
+ "dbAllocCtl failed in free AG\n");
}
return (rc);
}
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
budmin = dcp->budmin;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
}
if (n == 4) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: failed descending stree");
+ "failed descending stree\n");
release_metapage(mp);
return -EIO;
}
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
&blkno))) {
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: control page "
- "inconsistent");
+ "control page inconsistent\n");
return -EIO;
}
return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAG: unable to allocate blocks");
+ "unable to allocate blocks\n");
rc = -EIO;
}
return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
*/
rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
if (rc == -ENOSPC) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocAny: unable to allocate blocks");
+ jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
return -EIO;
}
return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
if (totrim == NULL) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: no memory for trim array");
+ jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
IWRITE_UNLOCK(ipbmap);
return 0;
}
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
nblocks = 1 << l2nb;
} else {
/* Trim any already allocated blocks */
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbDiscardAG: -EIO");
+ jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
break;
}
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: Corrupt dmapctl page");
+ "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
if (rc) {
if (lev != level) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbFindCtl: dmap inconsistent");
+ "dmap inconsistent\n");
return -EIO;
}
return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
if (dp->tree.stree[ROOT] != L2BPERDMAP) {
release_metapage(mp);
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: the dmap is not all free");
+ "the dmap is not all free\n");
rc = -EIO;
goto backout;
}
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: I/O Error: Block Leakage.");
+ "I/O Error: Block Leakage\n");
continue;
}
dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
* to indicate that we have leaked blocks.
*/
release_metapage(mp);
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocCtl: Block Leakage.");
+ jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
continue;
}
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
for (; nwords > 0; nwords -= nw) {
if (leaf[word] < BUDMIN) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAllocBits: leaf page "
- "corrupt");
+ "leaf page corrupt\n");
break;
}
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
dcp = (struct dmapctl *) mp->data;
if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
- jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: Corrupt dmapctl page");
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
release_metapage(mp);
return -EIO;
}
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
assert(level == bmp->db_maxlevel);
if (bmp->db_maxfreebud != oldroot) {
jfs_error(bmp->db_ipbmap->i_sb,
- "dbAdjCtl: the maximum free buddy is "
- "not the old root");
+ "the maximum free buddy is not the old root\n");
}
bmp->db_maxfreebud = dcp->stree[ROOT];
}
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
p = BMAPBLKNO + nbperpage; /* L2 page */
l2mp = read_metapage(ipbmap, p, PSIZE, 0);
if (!l2mp) {
- jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
+ jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
return -EIO;
}
l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
}
} /* for each L1 in a L2 */
- jfs_error(ipbmap->i_sb,
- "dbExtendFS: function has not returned as expected");
+ jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
errout:
if (l0mp)
release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
}
if (bmp->db_agpref >= bmp->db_numag) {
jfs_error(ipbmap->i_sb,
- "cannot find ag with average freespace");
+ "cannot find ag with average freespace\n");
}
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9f4ed13d9f15..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
/* get page buffer for specified block address */
-#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
- if (!(RC))\
- {\
- if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
- {\
- BT_PUTPAGE(MP);\
- jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
+ if (!(RC)) { \
+ if (((P)->header.nextindex > \
+ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
+ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ BT_PUTPAGE(MP); \
+ jfs_error((IP)->i_sb, \
+ "DT_GETPAGE: dtree page corrupt\n"); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
/* Something's corrupted, mark filesystem dirty so
* chkdsk will fix it.
*/
- jfs_error(sb, "stack overrun in dtSearch!");
+ jfs_error(sb, "stack overrun!\n");
BT_STACK_DUMP(btstack);
rc = -EIO;
goto out;
@@ -3247,8 +3247,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
/* Sanity Check */
if (d_namleft == 0) {
jfs_error(ip->i_sb,
- "JFS:Dtree error: ino = "
- "%ld, bn=%Ld, index = %d",
+ "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
(long)ip->i_ino,
(long long)bn,
i);
@@ -3368,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
*/
if (BT_STACK_FULL(btstack)) {
DT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
+ jfs_error(ip->i_sb, "btstack overrun\n");
BT_STACK_DUMP(btstack);
return -EIO;
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
if ((rc == 0) && xlen) {
if (xlen != nbperpage) {
- jfs_error(ip->i_sb, "extHint: corrupt xtree");
+ jfs_error(ip->i_sb, "corrupt xtree\n");
rc = -EIO;
}
XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
dp += rel_inode;
if (ip->i_ino != le32_to_cpu(dp->di_number)) {
- jfs_error(ip->i_sb, "diRead: i_ino != di_number");
+ jfs_error(ip->i_sb, "i_ino != di_number\n");
rc = -EIO;
} else if (le32_to_cpu(dp->di_nlink) == 0)
rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
if (!addressPXD(&(jfs_ip->ixpxd)) ||
(lengthPXD(&(jfs_ip->ixpxd)) !=
JFS_IP(ipimap)->i_imap->im_nbperiext)) {
- jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
+ jfs_error(ip->i_sb, "ixpxd invalid\n");
return -EIO;
}
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
if (iagno >= imap->im_nextiag) {
print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
imap, 32, 0);
- jfs_error(ip->i_sb,
- "diFree: inum = %d, iagno = %d, nextiag = %d",
+ jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
(uint) inum, iagno, imap->im_nextiag);
return -EIO;
}
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
mask = HIGHORDER >> bitno;
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
- jfs_error(ip->i_sb,
- "diFree: wmap shows inode already free");
+ jfs_error(ip->i_sb, "wmap shows inode already free\n");
}
if (!addressPXD(&iagp->inoext[extno])) {
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: invalid inoext");
+ jfs_error(ip->i_sb, "invalid inoext\n");
return -EIO;
}
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
release_metapage(mp);
IREAD_UNLOCK(ipimap);
AG_UNLOCK(imap, agno);
- jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
/*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
* for the inode being freed.
*/
if (iagp->pmap[extno] != 0) {
- jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
+ jfs_error(ip->i_sb, "the pmap does not show inode free\n");
}
iagp->wmap[extno] = 0;
PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
release_metapage(mp);
AG_UNLOCK(imap, agno);
jfs_error(ip->i_sb,
- "diAlloc: can't find free bit "
- "in wmap");
+ "can't find free bit in wmap\n");
return -EIO;
}
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
numinos = imap->im_agctl[agno].numinos;
if (numfree > numinos) {
- jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
+ jfs_error(ip->i_sb, "numfree > numinos\n");
return -EIO;
}
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (!iagp->nfreeinos) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb,
- "diAllocIno: nfreeinos = 0, but iag on freelist");
+ jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
return -EIO;
}
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
jfs_error(ip->i_sb,
- "diAllocIno: free inode not found in summary map");
+ "free inode not found in summary map\n");
return -EIO;
}
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: no free extent found");
+ jfs_error(ip->i_sb, "no free extent found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
if (rem >= INOSPEREXT) {
IREAD_UNLOCK(imap->im_ipimap);
release_metapage(mp);
- jfs_error(ip->i_sb, "diAllocIno: free inode not found");
+ jfs_error(ip->i_sb, "free inode not found\n");
return -EIO;
}
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
if ((rc = diIAGRead(imap, iagno, &mp))) {
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: error reading iag");
+ jfs_error(ip->i_sb, "error reading iag\n");
return rc;
}
iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (sword >= SMAPSZ) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb,
- "diAllocExt: free ext summary map not found");
+ jfs_error(ip->i_sb, "free ext summary map not found\n");
return -EIO;
}
if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
if (rem >= EXTSPERSUM) {
release_metapage(mp);
IREAD_UNLOCK(imap->im_ipimap);
- jfs_error(ip->i_sb, "diAllocExt: free extent not found");
+ jfs_error(ip->i_sb, "free extent not found\n");
return -EIO;
}
extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
if (bmp)
release_metapage(bmp);
- jfs_error(imap->im_ipimap->i_sb,
- "diAllocBit: iag inconsistent");
+ jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
return -EIO;
}
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* better have free extents.
*/
if (!iagp->nfreeexts) {
- jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
+ jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
return -EIO;
}
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
}
if (ciagp == NULL) {
jfs_error(imap->im_ipimap->i_sb,
- "diNewExt: ciagp == NULL");
+ "ciagp == NULL\n");
rc = -EIO;
goto error_out;
}
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
IWRITE_UNLOCK(ipimap);
IAGFREE_UNLOCK(imap);
jfs_error(imap->im_ipimap->i_sb,
- "diNewIAG: ipimap->i_size is wrong");
+ "ipimap->i_size is wrong\n");
return -EIO;
}
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
iagno = INOTOIAG(inum);
/* make sure that the iag is contained within the map */
if (iagno >= imap->im_nextiag) {
- jfs_error(ipimap->i_sb,
- "diUpdatePMap: the iag is outside the map");
+ jfs_error(ipimap->i_sb, "the iag is outside the map\n");
return -EIO;
}
/* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
*/
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in wmap!", inum);
+ "inode %ld not marked as allocated in wmap!\n",
+ inum);
}
if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
jfs_error(ipimap->i_sb,
- "diUpdatePMap: inode %ld not marked as "
- "allocated in pmap!", inum);
+ "inode %ld not marked as allocated in pmap!\n",
+ inum);
}
/* update the bitmap for the extent of the freed inode */
iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not allocated in "
- "the working map");
+ "the inode is not allocated in the working map\n");
return -EIO;
}
if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
release_metapage(mp);
jfs_error(ipimap->i_sb,
- "diUpdatePMap: the inode is not free in the "
- "persistent map");
+ "the inode is not free in the persistent map\n");
return -EIO;
}
/* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
iagp = (struct iag *) bp->data;
if (le32_to_cpu(iagp->iagnum) != i) {
release_metapage(bp);
- jfs_error(ipimap->i_sb,
- "diExtendFs: unexpected value of iagnum");
+ jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
return -EIO;
}
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
if (xnuminos != atomic_read(&imap->im_numinos) ||
xnumfree != atomic_read(&imap->im_numfree)) {
- jfs_error(ipimap->i_sb,
- "diExtendFs: numinos or numfree incorrect");
+ jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
return -EIO;
}
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- BUG_ON(offset);
+ BUG_ON(offset || length < PAGE_CACHE_SIZE);
BUG_ON(PageWriteback(page));
@@ -646,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (mp) {
if (mp->logical_size != size) {
jfs_error(inode->i_sb,
- "__get_metapage: mp->logical_size != size");
+ "get_mp->logical_size != size\n");
jfs_err("logical_size = %d, size = %d",
mp->logical_size, size);
dump_stack();
@@ -657,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
if (test_bit(META_discard, &mp->flag)) {
if (!new) {
jfs_error(inode->i_sb,
- "__get_metapage: using a "
- "discarded metapage");
+ "using a discarded metapage\n");
discard_metapage(mp);
goto unlock;
}
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
extern int readSuper(struct super_block *, struct buffer_head **);
extern int updateSuper(struct super_block *, uint);
+__printf(2, 3)
extern void jfs_error(struct super_block *, const char *, ...);
extern int jfs_mount(struct super_block *);
extern int jfs_mount_rw(struct super_block *, int);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
* mark filesystem dirty
*/
if (dirty)
- jfs_error(tblk->sb, "txAbort");
+ jfs_error(tblk->sb, "\n");
return;
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
/* get page buffer for specified block address */
/* ToDo: Replace this ugly macro with a function */
-#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
-{\
- BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
- if (!(RC))\
- {\
- if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
- (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
- (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
- {\
- jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
- BT_PUTPAGE(MP);\
- MP = NULL;\
- RC = -EIO;\
- }\
- }\
-}
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
+do { \
+ BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
+ if (!(RC)) { \
+ if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
+ (le16_to_cpu((P)->header.nextindex) > \
+ le16_to_cpu((P)->header.maxentry)) || \
+ (le16_to_cpu((P)->header.maxentry) > \
+ (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
+ jfs_error((IP)->i_sb, \
+ "XT_GETPAGE: xtree page corrupt\n"); \
+ BT_PUTPAGE(MP); \
+ MP = NULL; \
+ RC = -EIO; \
+ } \
+ } \
+} while (0)
/* for consistency */
#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
/* push (bn, index) of the parent page/entry */
if (BT_STACK_FULL(btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+ jfs_error(ip->i_sb, "xtSearch did not find extent\n");
return -EIO;
}
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
xad = &p->xad[index];
if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+ jfs_error(ip->i_sb, "extension is not contiguous\n");
return -EIO;
}
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+ jfs_error(ip->i_sb, "couldn't find extent\n");
return -EIO;
}
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
nextindex = le16_to_cpu(p->header.nextindex);
if (index != nextindex - 1) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTailgate: the entry found is not the last entry");
+ jfs_error(ip->i_sb, "the entry found is not the last entry\n");
return -EIO;
}
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+ jfs_error(ip->i_sb, "Could not find extent\n");
return -EIO;
}
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
(nxoff + nxlen > xoff + xlen)) {
XT_PUTPAGE(mp);
jfs_error(ip->i_sb,
- "xtUpdate: nXAD in not completely contained within XAD");
+ "nXAD in not completely contained within XAD\n");
return -EIO;
}
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (xoff >= nxoff) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+ jfs_error(ip->i_sb, "xoff >= nxoff\n");
return -EIO;
}
/* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+ jfs_error(ip->i_sb, "xtSearch failed\n");
return -EIO;
}
if (index0 != index) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtUpdate: unexpected value of index");
+ jfs_error(ip->i_sb, "unexpected value of index\n");
return -EIO;
}
}
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
if (cmp != 0) {
XT_PUTPAGE(mp);
- jfs_error(ip->i_sb,
- "xtTruncate_pmap: did not find extent");
+ jfs_error(ip->i_sb, "did not find extent\n");
return -EIO;
}
} else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
getChild:
/* save current parent entry for the child page */
if (BT_STACK_FULL(&btstack)) {
- jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+ jfs_error(ip->i_sb, "stack overrun!\n");
XT_PUTPAGE(mp);
return -EIO;
}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 8b19027291d6..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!S_ISDIR(old_ip->i_mode) && new_ip)
IWRITE_UNLOCK(new_ip);
jfs_error(new_ip->i_sb,
- "jfs_rename: new_ip->i_nlink != 0");
+ "new_ip->i_nlink != 0\n");
return -EIO;
}
tblk = tid_to_tblock(tid);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto resume;
error_out:
- jfs_error(sb, "jfs_extendfs");
+ jfs_error(sb, "\n");
resume:
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
/* nothing is done for continue beyond marking the superblock dirty */
}
-void jfs_error(struct super_block *sb, const char * function, ...)
+void jfs_error(struct super_block *sb, const char *fmt, ...)
{
- static char error_buf[256];
+ struct va_format vaf;
va_list args;
- va_start(args, function);
- vsnprintf(error_buf, sizeof(error_buf), function, args);
- va_end(args);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
- pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
+ pr_err("ERROR: (device %s): %pf: %pV\n",
+ sb->s_id, __builtin_return_address(0), &vaf);
+
+ va_end(args);
jfs_handle_error(sb);
}
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
txQuiesce(sb);
rc = lmLogShutdown(log);
if (rc) {
- jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+ jfs_error(sb, "lmLogShutdown failed\n");
/* let operations fail rather than hang */
txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
rc = updateSuper(sb, FM_MOUNT);
if (rc) {
- jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+ jfs_error(sb, "updateSuper failed\n");
goto out;
}
rc = lmLogInit(log);
if (rc)
- jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+ jfs_error(sb, "lmLogInit failed\n");
out:
txResume(sb);
}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
nbytes = sizeDXD(&ji->ea);
if (!nbytes) {
- jfs_error(sb, "ea_read: nbytes is 0");
+ jfs_error(sb, "nbytes is 0\n");
return -EIO;
}
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
current_blocks = 0;
} else {
if (!(ji->ea.flag & DXD_EXTENT)) {
- jfs_error(sb, "ea_get: invalid ea.flag)");
+ jfs_error(sb, "invalid ea.flag\n");
return -EIO;
}
current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
}
#ifdef CONFIG_JFS_SECURITY
-int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index c3a0837fb861..3a3a9b53bf5a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -61,7 +61,8 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- d_set_d_op(dentry, &simple_dentry_operations);
+ if (!dentry->d_sb->s_d_op)
+ d_set_d_op(dentry, &simple_dentry_operations);
d_add(dentry, NULL);
return NULL;
}
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 01bfe7662751..41e491b8e5d7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
nlm_init->protocol, nlm_version,
nlm_init->hostname, nlm_init->noresvport,
nlm_init->net);
- if (host == NULL) {
- lockd_down(nlm_init->net);
- return ERR_PTR(-ENOLCK);
- }
+ if (host == NULL)
+ goto out_nohost;
+ if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
+ goto out_nobind;
return host;
+out_nobind:
+ nlmclnt_release_host(host);
+out_nohost:
+ lockd_down(nlm_init->net);
+ return ERR_PTR(-ENOLCK);
}
EXPORT_SYMBOL_GPL(nlmclnt_init);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 9760ecb9b60f..acd394716349 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
{
struct nlm_args *argp = &req->a_args;
struct nlm_lock *lock = &argp->lock;
+ char *nodename = req->a_host->h_rpcclnt->cl_nodename;
nlmclnt_next_cookie(&argp->cookie);
memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
- lock->caller = utsname()->nodename;
+ lock->caller = nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
(unsigned int)fl->fl_u.nfs_fl.owner->pid,
- utsname()->nodename);
+ nodename);
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+ nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 067778b0ccc9..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -951,6 +951,7 @@ nlmsvc_retry_blocked(void)
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
+ spin_lock(&nlm_blocked_lock);
while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
@@ -960,6 +961,7 @@ nlmsvc_retry_blocked(void)
timeout = block->b_when - jiffies;
break;
}
+ spin_unlock(&nlm_blocked_lock);
dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
block, block->b_when);
@@ -969,7 +971,9 @@ nlmsvc_retry_blocked(void)
retry_deferred_block(block);
} else
nlmsvc_grant_blocked(block);
+ spin_lock(&nlm_blocked_lock);
}
+ spin_unlock(&nlm_blocked_lock);
return timeout;
}
diff --git a/fs/locks.c b/fs/locks.c
index 04e2c1fdb157..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,6 +127,8 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
#include <asm/uaccess.h>
@@ -155,11 +157,13 @@ int lease_break_time = 45;
for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
/*
- * The global file_lock_list is only used for displaying /proc/locks. Protected
- * by the file_lock_lock.
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
*/
-static HLIST_HEAD(file_lock_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -506,20 +510,30 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner;
}
+/* Must be called with the i_lock held! */
static inline void
locks_insert_global_locks(struct file_lock *fl)
{
- spin_lock(&file_lock_lock);
- hlist_add_head(&fl->fl_link, &file_lock_list);
- spin_unlock(&file_lock_lock);
+ lg_local_lock(&file_lock_lglock);
+ fl->fl_link_cpu = smp_processor_id();
+ hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+ lg_local_unlock(&file_lock_lglock);
}
+/* Must be called with the i_lock held! */
static inline void
locks_delete_global_locks(struct file_lock *fl)
{
- spin_lock(&file_lock_lock);
+ /*
+ * Avoid taking lock if already unhashed. This is safe since this check
+ * is done while holding the i_lock, and new insertions into the list
+ * also require that it be held.
+ */
+ if (hlist_unhashed(&fl->fl_link))
+ return;
+ lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
hlist_del_init(&fl->fl_link);
- spin_unlock(&file_lock_lock);
+ lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
}
static unsigned long
@@ -1454,7 +1468,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
if ((arg == F_WRLCK)
- && ((dentry->d_count > 1)
+ && ((d_count(dentry) > 1)
|| (atomic_read(&inode->i_count) > 1)))
goto out;
@@ -2243,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+struct locks_iterator {
+ int li_cpu;
+ loff_t li_pos;
+};
+
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
loff_t id, char *pfx)
{
@@ -2316,39 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
static int locks_show(struct seq_file *f, void *v)
{
+ struct locks_iterator *iter = f->private;
struct file_lock *fl, *bfl;
fl = hlist_entry(v, struct file_lock, fl_link);
- lock_get_status(f, fl, *((loff_t *)f->private), "");
+ lock_get_status(f, fl, iter->li_pos, "");
list_for_each_entry(bfl, &fl->fl_block, fl_block)
- lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+ lock_get_status(f, bfl, iter->li_pos, " ->");
return 0;
}
static void *locks_start(struct seq_file *f, loff_t *pos)
{
- loff_t *p = f->private;
+ struct locks_iterator *iter = f->private;
- spin_lock(&file_lock_lock);
+ iter->li_pos = *pos + 1;
+ lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
- *p = (*pos + 1);
- return seq_hlist_start(&file_lock_list, *pos);
+ return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
- loff_t *p = f->private;
- ++*p;
- return seq_hlist_next(v, &file_lock_list, pos);
+ struct locks_iterator *iter = f->private;
+
+ ++iter->li_pos;
+ return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
{
spin_unlock(&blocked_lock_lock);
- spin_unlock(&file_lock_lock);
+ lg_global_unlock(&file_lock_lglock);
}
static const struct seq_operations locks_seq_operations = {
@@ -2360,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
static int locks_open(struct inode *inode, struct file *filp)
{
- return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+ return seq_open_private(filp, &locks_seq_operations,
+ sizeof(struct locks_iterator));
}
static const struct file_operations proc_locks_operations = {
@@ -2460,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
static int __init filelock_init(void)
{
+ int i;
+
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+ for_each_possible_cpu(i)
+ INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
return 0;
}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 9c501449450d..427bb73e298f 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -245,8 +245,8 @@ static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
goto out;
if (memchr_inv(buf, 0xff, super->s_writesize))
err = -EIO;
- kfree(buf);
out:
+ kfree(buf);
return err;
}
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
}
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct logfs_block *block = logfs_block(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
return area;
}
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+ unsigned int l)
{
return;
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 54360293bcb5..b256c0690e5b 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -287,14 +287,14 @@ static int logfs_make_writeable(struct super_block *sb)
if (err)
return err;
+ /* Do one GC pass before any data gets dirtied */
+ logfs_gc_pass(sb);
+
/* Check areas for trailing unaccounted data */
err = logfs_check_areas(sb);
if (err)
return err;
- /* Do one GC pass before any data gets dirtied */
- logfs_gc_pass(sb);
-
/* after all initializations are done, replay the journal
* for rw-mounts, if necessary */
err = logfs_replay_journal(sb);
diff --git a/fs/namei.c b/fs/namei.c
index b2beee7a733f..89a612e392eb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2977,7 +2977,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
file->f_flags = op->open_flag;
- if (unlikely(file->f_flags & O_TMPFILE)) {
+ if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
goto out;
}
@@ -3671,15 +3671,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
return -EINVAL;
/*
- * To use null names we require CAP_DAC_READ_SEARCH
- * This ensures that not everyone will be able to create
- * handlink using the passed filedescriptor.
+ * Using empty names is equivalent to using AT_SYMLINK_FOLLOW
+ * on /proc/self/fd/<fd>.
*/
- if (flags & AT_EMPTY_PATH) {
- if (!capable(CAP_DAC_READ_SEARCH))
- return -ENOENT;
+ if (flags & AT_EMPTY_PATH)
how = LOOKUP_EMPTY;
- }
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0765ad12c382..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
switch (optval) {
case 'u':
data->uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->uid))
+ if (!uid_valid(data->uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'g':
data->gid = make_kgid(current_user_ns(), optint);
- if (!gid_valid(data->gid))
+ if (!gid_valid(data->gid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'o':
data->mounted_uid = make_kuid(current_user_ns(), optint);
- if (!uid_valid(data->mounted_uid))
+ if (!uid_valid(data->mounted_uid)) {
+ ret = -EINVAL;
goto err;
+ }
break;
case 'm':
data->file_mode = optint;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
/* we do not support files bigger than 4GB... We eventually
supports just 4GB... */
- if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff
+ if (vma_pages(vma) + vma->vm_pgoff
> (1U << (32 - PAGE_SHIFT)))
return -EFBIG;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
If unsure, say N.
+config NFS_V4_2
+ bool "NFS client support for NFSv4.2"
+ depends on NFS_V4_1
+ help
+ This option enables support for minor version 2 of the NFSv4 protocol
+ in the kernel's NFS client.
+
+ If unsure, say N.
+
config PNFS_FILE_LAYOUT
tristate
depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
If the NFS client is unchanged from the upstream kernel, this
option should be set to the default "kernel.org".
+config NFS_V4_SECURITY_LABEL
+ bool
+ depends on NFS_V4_2 && SECURITY
+ default y
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
direct.o pagelist.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o \
- dns_resolve.o cache_lib.o
+ write.o namespace.o mount_clnt.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
obj-$(CONFIG_NFS_V4) += nfsv4.o
nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
- nfs4namespace.o nfs4getroot.o nfs4client.o
+ nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
dev->pgbase = 0;
dev->pglen = PAGE_SIZE * max_pages;
dev->mincount = 0;
+ dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev);
+ rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc) {
rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
struct svc_rqst *rqstp;
int (*callback_svc)(void *vrqstp);
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- char svc_name[12];
int ret;
nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
svc_sock_update_bufs(serv);
- sprintf(svc_name, "nfsv4.%u-svc", minorversion);
cb_info->serv = serv;
cb_info->rqst = rqstp;
- cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+ cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ "nfsv4.%u-svc", minorversion);
if (IS_ERR(cb_info->task)) {
ret = PTR_ERR(cb_info->task);
svc_exit_thread(cb_info->rqst);
@@ -282,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
ret = nfs4_callback_up_net(serv, net);
break;
case 1:
+ case 2:
ret = nfs41_callback_up_net(serv, net);
break;
default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
OP_CB_WANTS_CANCELLED = 12,
OP_CB_NOTIFY_LOCK = 13,
OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+ OP_CB_OFFLOAD = 15,
OP_CB_ILLEGAL = 10044,
};
@@ -39,6 +41,7 @@ struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
u32 slotid;
+ u32 minorversion;
struct net *net;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
- clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
+ clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+ &args->csa_sessionid, cps->minorversion);
if (clp == NULL)
goto out;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
- /* Check minor version is zero or one. */
- if (hdr->minorversion <= 1) {
- hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
+ /* Check for minor version support */
+ if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+ hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
} else {
pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
"illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
}
#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ __be32 status = preprocess_nfs41_op(nop, op_nr, op);
+ if (status != htonl(NFS4ERR_OP_ILLEGAL))
+ return status;
+
+ if (op_nr == OP_CB_OFFLOAD)
+ return htonl(NFS4ERR_NOTSUPP);
+ return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
static __be32
preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
return htonl(NFS_OK);
}
-static __be32 process_op(uint32_t minorversion, int nop,
- struct svc_rqst *rqstp,
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
struct xdr_stream *xdr_out, void *resp,
struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
return status;
dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
- __func__, minorversion, nop, op_nr);
+ __func__, cps->minorversion, nop, op_nr);
+
+ switch (cps->minorversion) {
+ case 0:
+ status = preprocess_nfs4_op(op_nr, &op);
+ break;
+ case 1:
+ status = preprocess_nfs41_op(nop, op_nr, &op);
+ break;
+ case 2:
+ status = preprocess_nfs42_op(nop, op_nr, &op);
+ break;
+ default:
+ status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
- status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
- preprocess_nfs4_op(op_nr, &op);
if (status == htonl(NFS4ERR_OP_ILLEGAL))
op_nr = OP_CB_ILLEGAL;
if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
return rpc_drop_reply;
}
+ cps.minorversion = hdr_arg.minorversion;
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
return rpc_system_err;
while (status == 0 && nops != hdr_arg.nops) {
- status = process_op(hdr_arg.minorversion, nops, rqstp,
- &xdr_in, argp, &xdr_out, resp, &cps);
+ status = process_op(nops, rqstp, &xdr_in,
+ argp, &xdr_out, resp, &cps);
nops++;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..2dceee4db076 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -501,8 +501,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags;
- return rpc_ops->init_client(new, timeparms, ip_addr,
- authflavour);
+ return rpc_ops->init_client(new, timeparms, ip_addr);
}
spin_unlock(&nn->nfs_client_lock);
@@ -694,13 +693,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* @clp: nfs_client to initialise
* @timeparms: timeout parameters for underlying RPC transport
* @ip_addr: IP presentation address (not used)
- * @authflavor: authentication flavor for underlying RPC transport
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr, rpc_authflavor_t authflavour)
+ const char *ip_addr)
{
int error;
@@ -753,8 +751,6 @@ static int nfs_init_server(struct nfs_server *server,
data->timeo, data->retrans);
if (data->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- if (server->options & NFS_OPTION_MIGRATION)
- set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1072,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
}
if (!(fattr->valid & NFS_ATTR_FATTR)) {
- error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr);
+ error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d051419527b..39e69d47cd0b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
#include <linux/pagevec.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/swap.h>
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
@@ -436,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct dentry *alias;
struct inode *dir = parent->d_inode;
struct inode *inode;
+ int status;
if (filename.name[0] == '.') {
if (filename.len == 1)
@@ -448,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
dentry = d_lookup(parent, &filename);
if (dentry != NULL) {
if (nfs_same_file(dentry, entry)) {
- nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ if (!status)
+ nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
goto out;
} else {
if (d_invalidate(dentry) != 0)
@@ -461,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (dentry == NULL)
return;
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
if (IS_ERR(inode))
goto out;
@@ -586,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
if (entry.fh == NULL || entry.fattr == NULL)
goto out;
+ entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(entry.label)) {
+ status = PTR_ERR(entry.label);
+ goto out;
+ }
+
array = nfs_readdir_get_array(page);
if (IS_ERR(array)) {
status = PTR_ERR(array);
- goto out;
+ goto out_label_free;
}
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
@@ -615,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
nfs_readdir_free_large_page(pages_ptr, pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
+out_label_free:
+ nfs4_label_free(entry.label);
out:
nfs_free_fattr(entry.fattr);
nfs_free_fhandle(entry.fh);
@@ -805,7 +818,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
- int res;
+ int res = 0;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -827,7 +840,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- res = nfs_revalidate_mapping(inode, file->f_mapping);
+ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+ res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1039,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
if (flags & LOOKUP_RCU)
@@ -1081,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (fhandle == NULL || fattr == NULL)
goto out_error;
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out_error;
+
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error)
goto out_bad;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1089,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if ((error = nfs_refresh_inode(inode, fattr)) != 0)
goto out_bad;
+ nfs_setsecurity(inode, fattr, label);
+
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
@@ -1107,6 +1130,7 @@ out_zap_parent:
out_bad:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
nfs_mark_for_revalidate(dir);
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
@@ -1127,6 +1151,7 @@ out_zap_parent:
out_error:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
__func__, dentry->d_parent->d_name.name,
@@ -1255,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
int error;
dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1281,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (fhandle == NULL || fattr == NULL)
goto out;
+ label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out;
+
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
res = ERR_CAST(inode);
if (IS_ERR(res))
goto out_unblock_sillyrename;
@@ -1309,6 +1339,7 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
+ nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
@@ -1356,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
{
int err;
- if (ctx->dentry != dentry) {
- dput(ctx->dentry);
- ctx->dentry = dget(dentry);
- }
-
- /* If the open_intent is for execute, we have an extra check to make */
- if (ctx->mode & FMODE_EXEC) {
- err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
- if (err < 0)
- goto out;
- }
-
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;
@@ -1394,6 +1413,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+ err = nfs_check_flags(open_flags);
+ if (err)
+ return err;
+
/* NFS only supports OPEN on regular files */
if ((open_flags & O_DIRECTORY)) {
if (!d_unhashed(dentry)) {
@@ -1426,13 +1449,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
nfs_block_sillyrename(dentry->d_parent);
inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
- d_drop(dentry);
+ nfs_unblock_sillyrename(dentry->d_parent);
if (IS_ERR(inode)) {
- nfs_unblock_sillyrename(dentry->d_parent);
put_nfs_open_context(ctx);
err = PTR_ERR(inode);
switch (err) {
case -ENOENT:
+ d_drop(dentry);
d_add(dentry, NULL);
break;
case -EISDIR:
@@ -1448,16 +1471,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
- res = d_add_unique(dentry, inode);
- if (res != NULL)
- dentry = res;
-
- nfs_unblock_sillyrename(dentry->d_parent);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
- err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
- dput(res);
+ err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
out:
return err;
@@ -1527,7 +1542,8 @@ no_open:
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct dentry *parent = dget_parent(dentry);
struct inode *dir = parent->d_inode;
@@ -1540,18 +1556,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
if (dentry->d_inode)
goto out;
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
if (error)
goto out_error;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (!(fattr->valid & NFS_ATTR_FATTR)) {
struct nfs_server *server = NFS_SB(dentry->d_sb);
- error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
if (error < 0)
goto out_error;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_error;
@@ -1720,7 +1736,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_ino, dentry->d_name.name);
spin_lock(&dentry->d_lock);
- if (dentry->d_count > 1) {
+ if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
@@ -1758,7 +1774,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
*/
int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- struct pagevec lru_pvec;
struct page *page;
char *kaddr;
struct iattr attr;
@@ -1798,11 +1813,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- pagevec_init(&lru_pvec, 0);
- if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+ if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
GFP_KERNEL)) {
- pagevec_add(&lru_pvec, page);
- pagevec_lru_add_file(&lru_pvec);
SetPageUptodate(page);
unlock_page(page);
} else
@@ -1869,7 +1881,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
- new_dentry->d_count);
+ d_count(new_dentry));
/*
* For non-directories, check whether the target is busy and if so,
@@ -1887,7 +1899,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rehash = new_dentry;
}
- if (new_dentry->d_count > 2) {
+ if (d_count(new_dentry) > 2) {
int err;
/* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
kfree(ip_addr);
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
#else
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
ret = -ESRCH;
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
static struct cache_detail nfs_dns_resolve_template = {
.owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
cache_destroy_net(nn->nfs_dns_resolve, net);
}
+static int nfs4_dns_net_init(struct net *net)
+{
+ return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+ .init = nfs4_dns_net_init,
+ .exit = nfs4_dns_net_exit,
+};
+
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
void *ptr)
{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
int nfs_dns_resolver_init(void)
{
- return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ int err;
+
+ err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+ if (err < 0)
+ goto out;
+ err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ if (err < 0)
+ goto out1;
+ return 0;
+out1:
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+ return err;
}
void nfs_dns_resolver_destroy(void)
{
rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
}
#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+ page, offset, length);
- if (offset != 0)
+ if (offset != 0 || length < PAGE_CACHE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
return nfs_fscache_release_page(page, gfp);
}
+static void nfs_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct nfs_inode *nfsi;
+ struct address_space *mapping = page_file_mapping(page);
+
+ if (!mapping || PageSwapCache(page))
+ return;
+
+ /*
+ * Check if an unstable page is currently being committed and
+ * if so, have the VM treat it as if the page is under writeback
+ * so it will not block due to pages that will shortly be freeable.
+ */
+ nfsi = NFS_I(mapping->host);
+ if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ *writeback = true;
+ return;
+ }
+
+ /*
+ * If PagePrivate() is set, then the page is not freeable and as the
+ * inode is not being committed, it's not going to be cleaned in the
+ * near future so treat it as dirty
+ */
+ if (PagePrivate(page))
+ *dirty = true;
+}
+
/*
* Attempt to clear the private state associated with a page when an error
* occurs that requires the cached contents of an inode to be written back or
@@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
.direct_IO = nfs_direct_IO,
.migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
+ .is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
#ifdef CONFIG_NFS_SWAP
.swap_activate = nfs_swap_activate,
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
goto out;
}
- inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+ inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
return desclen;
}
-static ssize_t nfs_idmap_request_key(struct key_type *key_type,
- const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+ const char *type, struct idmap *idmap)
{
- const struct cred *saved_cred;
- struct key *rkey;
char *desc;
- struct user_key_payload *payload;
+ struct key *rkey;
ssize_t ret;
ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
if (ret <= 0)
- goto out;
+ return ERR_PTR(ret);
+
+ rkey = request_key(&key_type_id_resolver, desc, "");
+ if (IS_ERR(rkey)) {
+ mutex_lock(&idmap->idmap_mutex);
+ rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+ desc, "", 0, idmap);
+ mutex_unlock(&idmap->idmap_mutex);
+ }
+
+ kfree(desc);
+ return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
+{
+ const struct cred *saved_cred;
+ struct key *rkey;
+ struct user_key_payload *payload;
+ ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
- if (idmap)
- rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
- else
- rkey = request_key(&key_type_id_resolver, desc, "");
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
revert_creds(saved_cred);
- kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
goto out;
@@ -316,23 +329,6 @@ out:
return ret;
}
-static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
- const char *type, void *data,
- size_t data_size, struct idmap *idmap)
-{
- ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
- name, namelen, type, data,
- data_size, NULL);
- if (ret < 0) {
- mutex_lock(&idmap->idmap_mutex);
- ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
- name, namelen, type, data,
- data_size, idmap);
- mutex_unlock(&idmap->idmap_mutex);
- }
- return ret;
-}
-
/* ID -> Name */
static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
size_t buflen, struct idmap *idmap)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c1c7a9d78722..941246f2b43d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
-#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
#include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
{
if (fatal_signal_pending(current))
return -ERESTARTSYS;
- freezable_schedule();
+ freezable_schedule_unsafe();
return 0;
}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
nfs_fscache_invalidate(inode);
- } else {
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
- }
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
+ } else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_LABEL
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_PAGECACHE;
}
void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
return 0;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+ int error;
+
+ if (label == NULL)
+ return;
+
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
+ return;
+
+ if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
+ return;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+ error = security_inode_notifysecctx(inode, label->label,
+ label->len);
+ if (error)
+ printk(KERN_ERR "%s() %s %d "
+ "security_inode_notifysecctx() %d\n",
+ __func__,
+ (char *)label->label,
+ label->len, error);
+ }
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+ struct nfs4_label *label = NULL;
+ int minor_version = server->nfs_client->cl_minorversion;
+
+ if (minor_version < 2)
+ return label;
+
+ if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+ return label;
+
+ label = kzalloc(sizeof(struct nfs4_label), flags);
+ if (label == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+ if (label->label == NULL) {
+ kfree(label);
+ return ERR_PTR(-ENOMEM);
+ }
+ label->len = NFS4_MAXLABELLEN;
+
+ return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
/*
* This is our front-end to iget that looks up inodes by file handle
* instead of inode number.
*/
struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_find_desc desc = {
.fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
}
+
+ nfs_setsecurity(inode, fattr, label);
+
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
nfsi->access_cache = RB_ROOT;
@@ -449,7 +519,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
NFS_PROTO(inode)->return_delegation(inode);
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
if (error == 0)
- nfs_refresh_inode(inode, fattr);
+ error = nfs_refresh_inode(inode, fattr);
nfs_free_fattr(fattr);
out:
return error;
@@ -713,16 +783,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
*/
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = ctx->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
- filp->private_data = get_nfs_open_context(ctx);
spin_lock(&inode->i_lock);
list_add(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+ filp->private_data = get_nfs_open_context(ctx);
+ if (list_empty(&ctx->list))
+ nfs_inode_attach_open_context(ctx);
+}
EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
/*
@@ -748,10 +825,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
static void nfs_file_clear_open_context(struct file *filp)
{
- struct inode *inode = file_inode(filp);
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
+ struct inode *inode = ctx->dentry->d_inode;
+
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +868,7 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
+ struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
@@ -807,7 +886,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label)) {
+ status = PTR_ERR(label);
+ goto out;
+ }
+
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -817,7 +903,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (!S_ISDIR(inode->i_mode))
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
}
- goto out;
+ goto err_out;
}
status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +911,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
- goto out;
+ goto err_out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +921,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id,
(long long)NFS_FILEID(inode));
- out:
+err_out:
+ nfs4_label_free(label);
+out:
nfs_free_fattr(fattr);
return status;
}
@@ -847,7 +935,7 @@ int nfs_attribute_timeout(struct inode *inode)
return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
-static int nfs_attribute_cache_expired(struct inode *inode)
+int nfs_attribute_cache_expired(struct inode *inode)
{
if (nfs_have_delegated_attributes(inode))
return 0;
@@ -863,7 +951,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
*/
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
- if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
&& !nfs_attribute_cache_expired(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
@@ -873,9 +962,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
-
+ int ret;
+
if (mapping->nrpages != 0) {
- int ret = invalidate_inode_pages2(mapping);
+ if (S_ISREG(inode->i_mode)) {
+ ret = nfs_sync_mapping(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ ret = invalidate_inode_pages2(mapping);
if (ret < 0)
return ret;
}
@@ -1243,6 +1338,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
spin_lock(&inode->i_lock);
status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
+
return status;
}
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1579,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
- if (invalid & NFS_INO_INVALID_ATTR) {
+ if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1592,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
}
invalid &= ~NFS_INO_INVALID_ATTR;
+ invalid &= ~NFS_INO_INVALID_LABEL;
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1638,12 +1735,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
nfs_clients_init(net);
- return nfs_dns_resolver_cache_init(net);
+ return 0;
}
static void nfs_net_exit(struct net *net)
{
- nfs_dns_resolver_cache_destroy(net);
nfs_cleanup_cb_ident_idr(net);
}
@@ -1661,10 +1757,6 @@ static int __init init_nfs_fs(void)
{
int err;
- err = nfs_dns_resolver_init();
- if (err < 0)
- goto out10;;
-
err = register_pernet_subsys(&nfs_net_ops);
if (err < 0)
goto out9;
@@ -1730,8 +1822,6 @@ out7:
out8:
unregister_pernet_subsys(&nfs_net_ops);
out9:
- nfs_dns_resolver_destroy();
-out10:
return err;
}
@@ -1744,7 +1834,6 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
- nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..9b694f1e06c5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
- struct nfs4_sessionid *);
+ struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
struct nfs_subversion *);
extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
#ifdef CONFIG_NFS_V4_1
extern const u32 nfs41_maxread_overhead;
extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
#endif
/* nfs4proc.c */
@@ -266,7 +267,7 @@ extern struct rpc_procinfo nfs4_procedures[];
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr, rpc_authflavor_t authflavour);
+ const char *ip_addr);
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -450,8 +451,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
extern void __nfs4_read_done_cb(struct nfs_read_data *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour);
+ const char *ip_addr);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
* nfs_mount - Obtain an NFS file handle for the given host and path
* @info: pointer to mount request arguments
*
- * Uses default timeout parameters specified by underlying transport.
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
*/
int nfs_mount(struct nfs_mount_request *info)
{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
dprintk("NFS: MNT request succeeded\n");
status = 0;
+ /*
+ * If the server didn't provide a flavor list, allow the
+ * client to try any flavor.
+ */
+ if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+ dprintk("NFS: Faking up auth_flavs list\n");
+ info->auth_flavs[0] = RPC_AUTH_NULL;
+ *info->auth_flav_len = 1;
+ }
out:
return status;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
struct dentry *parent = dget_parent(dentry);
/* Look it up again to get its attributes */
- err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+ err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
dput(parent);
if (err != 0)
return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX)
break;
- freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+ freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs3_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs3_diropargs arg = {
.fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
nfs_post_op_update_inode(dir, data->res.dir_attr);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
return status;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..d7bb59d5dd9c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -193,8 +193,7 @@ struct nfs4_state_recovery_ops {
int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
- struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
- int (*reclaim_complete)(struct nfs_client *);
+ int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
struct rpc_cred *);
};
@@ -303,10 +302,10 @@ is_ds_client(struct nfs_client *clp)
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
extern const u32 nfs4_fattr_bitmap[3];
-extern const u32 nfs4_statfs_bitmap[2];
-extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
extern const u32 nfs4_fsinfo_bitmap[3];
-extern const u32 nfs4_fs_locations_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[3];
void nfs4_free_client(struct nfs_client *);
@@ -319,7 +318,7 @@ extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
/* nfs4state.c */
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
int nfs4_discover_server_trunking(struct nfs_client *clp,
@@ -327,7 +326,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
int nfs40_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
#if defined(CONFIG_NFS_V4_1)
-struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..767a5e37fe97 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
if (err)
goto error;
+ if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+ err = -EINVAL;
+ goto error;
+ }
+
spin_lock_init(&clp->cl_lock);
INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -182,8 +187,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
*/
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour)
+ const char *ip_addr)
{
char buf[INET6_ADDRSTRLEN + 1];
struct nfs_client *old;
@@ -562,14 +566,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
*/
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
struct nfs_client *clp;
struct nfs_net *nn = net_generic(net, nfs_net_id);
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
- if (nfs4_cb_match_client(addr, clp, 1) == false)
+ if (nfs4_cb_match_client(addr, clp, minorversion) == false)
continue;
if (!nfs4_has_session(clp))
@@ -592,7 +596,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
struct nfs_client *
nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
- struct nfs4_sessionid *sid)
+ struct nfs4_sessionid *sid, u32 minorversion)
{
return NULL;
}
@@ -626,6 +630,8 @@ static int nfs4_set_client(struct nfs_server *server,
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (server->options & NFS_OPTION_MIGRATION)
+ set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +736,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
return -ENOMEM;
/* We must ensure the session is initialised first */
- error = nfs4_init_session(server);
+ error = nfs4_init_session(server->nfs_client);
if (error < 0)
goto out;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
goto out_drop;
}
}
- iput(inode);
if (inode != dentry->d_inode)
goto out_drop;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
NFS_SERVER(lo->plh_inode)->nfs_client, id);
if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
+ dsaddr = filelayout_get_device_info(lo->plh_inode, id,
+ lo->plh_lc_cred, gfp_flags);
if (dsaddr == NULL)
goto out;
} else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
* of available devices, and return it.
*/
struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
+filelayout_get_device_info(struct inode *inode,
+ struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred,
+ gfp_t gfp_flags)
{
struct pnfs_device *pdev = NULL;
u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
pdev->pgbase = 0;
pdev->pglen = max_resp_sz;
pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
- rc = nfs4_proc_getdeviceinfo(server, pdev);
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
dprintk("%s getdevice info returns %d\n", __func__, rc);
if (rc)
goto out_free;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d7ba5616989c..f672c340393d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state);
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+ struct rpc_cred *);
#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label)
+{
+ int err;
+
+ if (label == NULL)
+ return NULL;
+
+ if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+ return NULL;
+
+ if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
+ return NULL;
+
+ err = security_dentry_init_security(dentry, sattr->ia_mode,
+ &dentry->d_name, (void **)&label->label, &label->len);
+ if (err == 0)
+ return label;
+
+ return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+ if (label)
+ security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+ if (label)
+ return server->attr_bitmask;
+
+ return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
{
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
- | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
| FATTR4_WORD0_FILEID,
};
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
FATTR4_WORD0_FILES_AVAIL
| FATTR4_WORD0_FILES_FREE
| FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
| FATTR4_WORD1_SPACE_TOTAL
};
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
FATTR4_WORD0_MAXLINK
| FATTR4_WORD0_MAXNAME,
0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD2_LAYOUT_BLKSIZE
};
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
FATTR4_WORD0_TYPE
| FATTR4_WORD0_CHANGE
| FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY
- | FATTR4_WORD1_MOUNTED_ON_FILEID
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
};
static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -268,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
*timeout = NFS4_POLL_RETRY_MIN;
if (*timeout > NFS4_POLL_RETRY_MAX)
*timeout = NFS4_POLL_RETRY_MAX;
- freezable_schedule_timeout_killable(*timeout);
+ freezable_schedule_timeout_killable_unsafe(*timeout);
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
*timeout <<= 1;
@@ -762,6 +818,7 @@ struct nfs4_opendata {
struct nfs4_string owner_name;
struct nfs4_string group_name;
struct nfs_fattr f_attr;
+ struct nfs4_label *f_label;
struct dentry *dir;
struct dentry *dentry;
struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
static void nfs4_init_opendata_res(struct nfs4_opendata *p)
{
p->o_res.f_attr = &p->f_attr;
+ p->o_res.f_label = p->f_label;
p->o_res.seqid = p->o_arg.seqid;
p->c_res.seqid = p->c_arg.seqid;
p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
const struct iattr *attrs,
+ struct nfs4_label *label,
enum open_claim_type4 claim,
gfp_t gfp_mask)
{
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
goto err;
+
+ p->f_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_label))
+ goto err_free_p;
+
p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
if (p->o_arg.seqid == NULL)
- goto err_free;
+ goto err_free_label;
nfs_sb_active(dentry->d_sb);
p->dentry = dget(dentry);
p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
p->o_arg.name = &dentry->d_name;
p->o_arg.server = server;
- p->o_arg.bitmask = server->attr_bitmask;
+ p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+ p->o_arg.label = label;
p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
switch (p->o_arg.claim) {
case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
nfs4_init_opendata_res(p);
kref_init(&p->kref);
return p;
-err_free:
+
+err_free_label:
+ nfs4_label_free(p->f_label);
+err_free_p:
kfree(p);
err:
dput(parent);
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
if (p->state != NULL)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
+
+ nfs4_label_free(p->f_label);
+
dput(p->dir);
dput(p->dentry);
nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (ret)
goto err;
+ nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
ret = -EAGAIN;
if (!(data->f_attr.valid & NFS_ATTR_FATTR))
goto err;
- inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+ inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
ret = PTR_ERR(inode);
if (IS_ERR(inode))
goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
struct nfs4_opendata *opendata;
opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
- NULL, claim, GFP_NOFS);
+ NULL, NULL, claim, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
- _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+ _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
return 0;
}
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->stateid;
- int status;
+ struct nfs_delegation *delegation;
+ struct rpc_cred *cred = NULL;
+ int status = -NFS4ERR_BAD_STATEID;
/* If a state reset has been done, test_stateid is unneeded */
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
return;
- status = nfs41_test_stateid(server, stateid);
+ /* Get the delegation credential for use by test/free_stateid */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation != NULL &&
+ nfs4_stateid_match(&delegation->stateid, stateid)) {
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, stateid, cred);
+ } else
+ rcu_read_unlock();
+
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
nfs_remove_bad_delegation(state->inode);
write_seqlock(&state->seqlock);
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
write_sequnlock(&state->seqlock);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
}
+
+ if (cred != NULL)
+ put_rpccred(cred);
}
/**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid *stateid = &state->open_stateid;
+ struct rpc_cred *cred = state->owner->so_cred;
int status;
/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
(test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
return -NFS4ERR_BAD_STATEID;
- status = nfs41_test_stateid(server, stateid);
+ status = nfs41_test_stateid(server, stateid, cred);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid);
+ nfs41_free_stateid(server, stateid, cred);
clear_bit(NFS_O_RDONLY_STATE, &state->flags);
clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
fmode_t fmode,
int flags,
- struct nfs4_state **res)
+ struct nfs_open_context *ctx)
{
struct nfs4_state_owner *sp = opendata->owner;
struct nfs_server *server = sp->so_server;
+ struct dentry *dentry;
struct nfs4_state *state;
unsigned int seq;
int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (server->caps & NFS_CAP_POSIX_LOCK)
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ dentry = opendata->dentry;
+ if (dentry->d_inode == NULL) {
+ /* FIXME: Is this d_drop() ever needed? */
+ d_drop(dentry);
+ dentry = d_add_unique(dentry, igrab(state->inode));
+ if (dentry == NULL) {
+ dentry = opendata->dentry;
+ } else if (dentry != ctx->dentry) {
+ dput(ctx->dentry);
+ ctx->dentry = dget(dentry);
+ }
+ nfs_set_verifier(dentry,
+ nfs_save_change_attribute(opendata->dir->d_inode));
+ }
+
ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
if (ret != 0)
goto out;
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
- nfs4_schedule_stateid_recovery(server, state);
- *res = state;
+ ctx->state = state;
+ if (dentry->d_inode == state->inode) {
+ nfs_inode_attach_open_context(ctx);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ nfs4_schedule_stateid_recovery(server, state);
+ }
out:
return ret;
}
@@ -1978,19 +2086,21 @@ out:
* Returns a referenced nfs4_state
*/
static int _nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_state **res,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
struct nfs4_state_owner *sp;
struct nfs4_state *state = NULL;
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_opendata *opendata;
+ struct dentry *dentry = ctx->dentry;
+ struct rpc_cred *cred = ctx->cred;
+ struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+ fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct nfs4_label *olabel = NULL;
int status;
/* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
if (dentry->d_inode)
claim = NFS4_OPEN_CLAIM_FH;
opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
- claim, GFP_KERNEL);
+ label, claim, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
+ if (label) {
+ olabel = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = PTR_ERR(olabel);
+ goto err_opendata_put;
+ }
+ }
+
if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
if (!opendata->f_attr.mdsthreshold)
- goto err_opendata_put;
+ goto err_free_label;
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
if (dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
- status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+ status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
if (status != 0)
- goto err_opendata_put;
+ goto err_free_label;
+ state = ctx->state;
if ((opendata->o_arg.open_flags & O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- state);
- if (status == 0)
+ state, label, olabel);
+ if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
}
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
kfree(opendata->f_attr.mdsthreshold);
opendata->f_attr.mdsthreshold = NULL;
+ nfs4_label_free(olabel);
+
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
- *res = state;
return 0;
+err_free_label:
+ nfs4_label_free(olabel);
err_opendata_put:
kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
out_err:
- *res = NULL;
return status;
}
static struct nfs4_state *nfs4_do_open(struct inode *dir,
- struct dentry *dentry,
- fmode_t fmode,
+ struct nfs_open_context *ctx,
int flags,
struct iattr *sattr,
- struct rpc_cred *cred,
- struct nfs4_threshold **ctx_th)
+ struct nfs4_label *label)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_state *res;
int status;
- fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
do {
- status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
- &res, ctx_th);
+ status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+ res = ctx->state;
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
.iap = sattr,
.server = server,
.bitmask = server->attr_bitmask,
+ .label = ilabel,
};
struct nfs_setattrres res = {
.fattr = fattr,
+ .label = olabel,
.server = server,
};
struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
bool truncate;
int status;
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
nfs_fattr_init(fattr);
/* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state)
+ struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
};
int err;
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+ err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
{
struct nfs4_state *state;
+ struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+ label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
/* Protect against concurrent sillydeletes */
- state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
- ctx->cred, &ctx->mdsthreshold);
+ state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+ nfs4_label_release_security(label);
+
if (IS_ERR(state))
return ERR_CAST(state);
- ctx->state = state;
- return igrab(state->inode);
+ return state->inode;
}
static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_CTIME;
if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+ memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+ sizeof(server->attr_bitmask));
+ if (server->caps & NFS_CAP_SECURITY_LABEL) {
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ }
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
+ u32 bitmask[3];
struct nfs4_lookup_root_arg args = {
- .bitmask = nfs4_fattr_bitmap,
+ .bitmask = bitmask,
};
struct nfs4_lookup_res res = {
.server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
+ bitmask[0] = nfs4_fattr_bitmap[0];
+ bitmask[1] = nfs4_fattr_bitmap[1];
+ /*
+ * Process the label in the upcoming getfattr
+ */
+ bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
nfs_fattr_init(info->fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
+ struct nfs4_label *label = NULL;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- error = nfs4_proc_getattr(server, mntfh, fattr);
+ label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ error = nfs4_proc_getattr(server, mntfh, fattr, label);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
- return error;
+ goto err_free_label;
}
if (fattr->valid & NFS_ATTR_FATTR_FSID &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid))
memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+err_free_label:
+ nfs4_label_free(label);
+
return error;
}
@@ -2711,7 +2869,8 @@ out:
return status;
}
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_getattr_arg args = {
.fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
struct nfs4_getattr_res res = {
.fattr = fattr,
+ .label = label,
.server = server,
};
struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
-
+
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getattr(server, fhandle, fattr),
+ _nfs4_proc_getattr(server, fhandle, fattr, label),
&exception);
} while (exception.retry);
return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct inode *inode = dentry->d_inode;
struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
+ struct nfs4_label *label = NULL;
int status;
if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2776,10 +2940,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
/* Deal with open(O_TRUNC) */
if (sattr->ia_valid & ATTR_OPEN)
- sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+ sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
/* Optimization: if the end result is no change, don't RPC */
- if ((sattr->ia_valid & ~(ATTR_FILE)) == 0)
+ if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
/* Search for an existing open(O_WRITE) file */
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
}
- status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
- if (status == 0)
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+ if (status == 0) {
nfs_setattr_update_inode(inode, sattr);
+ nfs_setsecurity(inode, fattr, label);
+ }
+ nfs4_label_free(label);
return status;
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
const struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct nfs4_lookup_res res = {
.server = server,
.fattr = fattr,
+ .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
.rpc_resp = &res,
};
+ args.bitmask = nfs4_bitmask(server, label);
+
nfs_fattr_init(fattr);
dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
struct qstr *name, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct nfs4_exception exception = { };
struct rpc_clnt *client = *clnt;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+ err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
switch (err) {
case -NFS4ERR_BADNAME:
err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
}
static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -2896,15 +3071,13 @@ struct rpc_clnt *
nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
+ struct rpc_clnt *client = NFS_CLIENT(dir);
int status;
- struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
- status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
- if (status < 0) {
- rpc_shutdown_client(client);
+ status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
+ if (status < 0)
return ERR_PTR(status);
- }
- return client;
+ return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
}
static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
@@ -2924,7 +3097,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
.rpc_cred = entry->cred,
};
int mode = entry->mask;
- int status;
+ int status = 0;
/*
* Determine which access bits we want to ask for...
@@ -3029,6 +3202,7 @@ static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -3037,19 +3211,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
+ ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, dentry, ctx->mode,
- flags, sattr, ctx->cred,
- &ctx->mdsthreshold);
- d_drop(dentry);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
}
- d_add(dentry, igrab(state->inode));
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- ctx->state = state;
out:
+ nfs4_label_release_security(ilabel);
put_nfs_open_context(ctx);
return status;
}
@@ -3098,6 +3269,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+ nfs_fattr_init(res->dir_attr);
}
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3346,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
.rpc_resp = &res,
};
int status = -ENOMEM;
-
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3380,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
};
struct nfs4_link_res res = {
.server = server,
+ .label = NULL,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3393,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
if (res.fattr == NULL)
goto out;
+ res.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(res.label)) {
+ status = PTR_ERR(res.label);
+ goto out;
+ }
+ arg.bitmask = nfs4_bitmask(server, res.label);
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(dir, &res.cinfo);
- nfs_post_op_update_inode(inode, res.fattr);
+ status = nfs_post_op_update_inode(inode, res.fattr);
+ if (!status)
+ nfs_setsecurity(inode, res.fattr, res.label);
}
+
+
+ nfs4_label_free(res.label);
+
out:
nfs_free_fattr(res.fattr);
return status;
@@ -3247,6 +3434,7 @@ struct nfs4_createdata {
struct nfs4_create_res res;
struct nfs_fh fh;
struct nfs_fattr fattr;
+ struct nfs4_label *label;
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3446,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
if (data != NULL) {
struct nfs_server *server = NFS_SERVER(dir);
+ data->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->label))
+ goto out_free;
+
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
data->msg.rpc_argp = &data->arg;
data->msg.rpc_resp = &data->res;
@@ -3266,13 +3458,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.name = name;
data->arg.attrs = sattr;
data->arg.ftype = ftype;
- data->arg.bitmask = server->attr_bitmask;
+ data->arg.bitmask = nfs4_bitmask(server, data->label);
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
+ data->res.label = data->label;
nfs_fattr_init(data->res.fattr);
}
return data;
+out_free:
+ kfree(data);
+ return NULL;
}
static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3477,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
update_changeattr(dir, &data->res.dir_cinfo);
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
}
return status;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
+ nfs4_label_free(data->label);
kfree(data);
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct page *page, unsigned int len, struct iattr *sattr,
+ struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -3308,6 +3506,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
data->arg.u.symlink.pages = &page;
data->arg.u.symlink.len = len;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
@@ -3320,18 +3519,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
_nfs4_proc_symlink(dir, dentry, page,
- len, sattr),
+ len, sattr, label),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
return err;
}
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+ struct iattr *sattr, struct nfs4_label *label)
{
struct nfs4_createdata *data;
int status = -ENOMEM;
@@ -3340,6 +3545,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (data == NULL)
goto out;
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
nfs4_free_createdata(data);
@@ -3351,14 +3557,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mkdir(dir, dentry, sattr),
+ _nfs4_proc_mkdir(dir, dentry, sattr, label),
&exception);
} while (exception.retry);
+ nfs4_label_release_security(label);
+
return err;
}
@@ -3416,7 +3627,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
}
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, dev_t rdev)
+ struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
{
struct nfs4_createdata *data;
int mode = sattr->ia_mode;
@@ -3441,7 +3652,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
status = -EINVAL;
goto out_free;
}
-
+
+ data->arg.label = label;
status = nfs4_do_create(dir, dentry, data);
out_free:
nfs4_free_createdata(data);
@@ -3453,14 +3665,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
struct nfs4_exception exception = { };
+ struct nfs4_label l, *label = NULL;
int err;
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
sattr->ia_mode &= ~current_umask();
do {
err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mknod(dir, dentry, sattr, rdev),
+ _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
&exception);
} while (exception.retry);
+
+ nfs4_label_release_security(label);
+
return err;
}
@@ -4187,6 +4405,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
return err;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_fattr fattr;
+ struct nfs4_label label = {0, 0, buflen, buf};
+
+ u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs4_getattr_arg args = {
+ .fh = NFS_FH(inode),
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = &fattr,
+ .label = &label,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+
+ nfs_fattr_init(&fattr);
+
+ ret = rpc_call_sync(server->client, &msg, 0);
+ if (ret)
+ return ret;
+ if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+ return -ENOENT;
+ if (buflen < label.len)
+ return -ERANGE;
+ return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_get_security_label(inode, buf, buflen),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{
+
+ struct iattr sattr = {0};
+ struct nfs_server *server = NFS_SERVER(inode);
+ const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_setattrargs args = {
+ .fh = NFS_FH(inode),
+ .iap = &sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+ status = rpc_call_sync(server->client, &msg, 0);
+ if (status)
+ dprintk("%s failed: %d\n", __func__, status);
+
+ return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *olabel)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_do_set_security_label(inode, ilabel,
+ fattr, olabel),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+ struct nfs4_label ilabel, *olabel = NULL;
+ struct nfs_fattr fattr;
+ struct rpc_cred *cred;
+ struct inode *inode = dentry->d_inode;
+ int status;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ nfs_fattr_init(&fattr);
+
+ ilabel.pi = 0;
+ ilabel.lfs = 0;
+ ilabel.label = (char *)buf;
+ ilabel.len = buflen;
+
+ cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+
+ olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = -PTR_ERR(olabel);
+ goto out;
+ }
+
+ status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+ if (status == 0)
+ nfs_setsecurity(inode, &fattr, olabel);
+
+ nfs4_label_free(olabel);
+out:
+ put_rpccred(cred);
+ return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
static int
nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
{
@@ -4293,10 +4660,14 @@ static unsigned int
nfs4_init_uniform_client_string(const struct nfs_client *clp,
char *buf, size_t len)
{
- char *nodename = clp->cl_rpcclient->cl_nodename;
+ const char *nodename = clp->cl_rpcclient->cl_nodename;
if (nfs4_client_id_uniquifier[0] != '\0')
- nodename = nfs4_client_id_uniquifier;
+ return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version,
+ clp->cl_minorversion,
+ nfs4_client_id_uniquifier,
+ nodename);
return scnprintf(buf, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
nodename);
@@ -4345,7 +4716,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
/* cb_client4 */
rcu_read_lock();
setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
- sizeof(setclientid.sc_netid),
+ sizeof(setclientid.sc_netid), "%s",
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_NETID));
rcu_read_unlock();
@@ -4528,7 +4899,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
static unsigned long
nfs4_set_lock_task_retry(unsigned long timeout)
{
- freezable_schedule_timeout_killable(timeout);
+ freezable_schedule_timeout_killable_unsafe(timeout);
timeout <<= 1;
if (timeout > NFS4_LOCK_MAXTIMEOUT)
return NFS4_LOCK_MAXTIMEOUT;
@@ -5056,13 +5427,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
list_for_each_entry(lsp, &state->lock_states, ls_locks) {
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- status = nfs41_test_stateid(server, &lsp->ls_stateid);
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+ status = nfs41_test_stateid(server,
+ &lsp->ls_stateid,
+ cred);
if (status != NFS_OK) {
/* Free the stateid unless the server
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
nfs41_free_stateid(server,
- &lsp->ls_stateid);
+ &lsp->ls_stateid,
+ cred);
clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
ret = status;
}
@@ -5295,6 +5671,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
return len;
}
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+ return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags, int type)
+{
+ if (security_ismaclabel(key))
+ return nfs4_set_security_label(dentry, buf, buflen);
+
+ return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+ void *buf, size_t buflen, int type)
+{
+ if (security_ismaclabel(key))
+ return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+ return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+ size_t list_len, const char *name,
+ size_t name_len, int type)
+{
+ size_t len = 0;
+
+ if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+ if (list && len <= list_len)
+ security_inode_listsecurity(dentry->d_inode, list, len);
+ }
+ return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = nfs4_xattr_list_nfs4_label,
+ .get = nfs4_xattr_get_nfs4_label,
+ .set = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
*/
@@ -5318,7 +5741,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
struct page *page)
{
struct nfs_server *server = NFS_SERVER(dir);
- u32 bitmask[2] = {
+ u32 bitmask[3] = {
[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
};
struct nfs4_fs_locations_arg args = {
@@ -5369,6 +5792,10 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
return err;
}
+/**
+ * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * possible) as per RFC3530bis and RFC5661 Security Considerations sections
+ */
static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
{
int status;
@@ -5384,9 +5811,10 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
.rpc_argp = &args,
.rpc_resp = &res,
};
+ struct rpc_clnt *clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
dprintk("NFS call secinfo %s\n", name->name);
- status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
+ status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
dprintk("NFS reply secinfo: %d\n", status);
return status;
}
@@ -5505,7 +5933,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
struct nfs41_exchange_id_args args = {
.verifier = &verifier,
.client = clp,
- .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
+ .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID,
};
struct nfs41_exchange_id_res res = {
0
@@ -5643,7 +6072,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
goto out;
if (clp->cl_preserve_clid)
goto out;
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_destroy_clientid(clp, cred);
if (cred)
put_rpccred(cred);
@@ -5762,17 +6191,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
*/
static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
{
- struct nfs4_session *session = args->client->cl_session;
- unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
- mxresp_sz = session->fc_target_max_resp_sz;
+ unsigned int max_rqst_sz, max_resp_sz;
+
+ max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+ max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
- if (mxrqst_sz == 0)
- mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
- if (mxresp_sz == 0)
- mxresp_sz = NFS_MAX_FILE_IO_SIZE;
/* Fore channel attributes */
- args->fc_attrs.max_rqst_sz = mxrqst_sz;
- args->fc_attrs.max_resp_sz = mxresp_sz;
+ args->fc_attrs.max_rqst_sz = max_rqst_sz;
+ args->fc_attrs.max_resp_sz = max_resp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
args->fc_attrs.max_reqs = max_session_slots;
@@ -6159,12 +6585,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
/*
* Issue a global reclaim complete.
*/
-static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+ struct rpc_cred *cred)
{
struct nfs4_reclaim_complete_data *calldata;
struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6776,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
.rpc_argp = &lgp->args,
.rpc_resp = &lgp->res,
+ .rpc_cred = lgp->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = server->client,
@@ -6451,9 +6880,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
.rpc_argp = &lrp->args,
.rpc_resp = &lrp->res,
+ .rpc_cred = lrp->cred,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_client = NFS_SERVER(lrp->args.inode)->client,
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
@@ -6520,7 +6950,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
static int
-_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_getdeviceinfo_args args = {
.pdev = pdev,
@@ -6532,6 +6964,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
int status;
@@ -6542,14 +6975,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
return status;
}
-int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_proc_getdeviceinfo(server, pdev),
+ _nfs4_proc_getdeviceinfo(server, pdev, cred),
&exception);
} while (exception.retry);
return err;
@@ -6653,6 +7088,10 @@ out:
return status;
}
+/**
+ * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * possible) as per RFC3530bis and RFC5661 Security Considerations sections
+ */
static int
_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
@@ -6668,7 +7107,8 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
- return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ return nfs4_call_sync(server->nfs_client->cl_rpcclient, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
}
static int
@@ -6733,7 +7173,9 @@ out:
return err;
}
-static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int _nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
int status;
struct nfs41_test_stateid_args args = {
@@ -6744,6 +7186,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
.rpc_argp = &args,
.rpc_resp = &res,
+ .rpc_cred = cred,
};
dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7207,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to test
+ * @cred: credential
*
* Returns NFS_OK if the server recognizes that "stateid" is valid.
* Otherwise a negative NFS4ERR value is returned if the operation
* failed or the state ID is not currently valid.
*/
-static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs41_test_stateid(server, stateid);
+ err = _nfs41_test_stateid(server, stateid, cred);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7269,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
+ struct rpc_cred *cred,
bool privileged)
{
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup = {
.rpc_client = server->client,
@@ -6859,16 +7307,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
*
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
+ * @cred: credential
*
* Returns NFS_OK if the server freed "stateid". Otherwise a
* negative NFS4ERR value is returned.
*/
-static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
{
struct rpc_task *task;
int ret;
- task = _nfs41_free_stateid(server, stateid, true);
+ task = _nfs41_free_stateid(server, stateid, cred, true);
if (IS_ERR(task))
return PTR_ERR(task);
ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7332,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct rpc_task *task;
+ struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
- task = _nfs41_free_stateid(server, &lsp->ls_stateid, false);
+ task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -6919,7 +7371,6 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
.recover_open = nfs4_open_reclaim,
.recover_lock = nfs4_lock_reclaim,
.establish_clid = nfs4_init_clientid,
- .get_clid_cred = nfs4_get_setclientid_cred,
.detect_trunking = nfs40_discover_server_trunking,
};
@@ -6930,7 +7381,6 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
.recover_open = nfs4_open_reclaim,
.recover_lock = nfs4_lock_reclaim,
.establish_clid = nfs41_init_clientid,
- .get_clid_cred = nfs4_get_exchange_id_cred,
.reclaim_complete = nfs41_proc_reclaim_complete,
.detect_trunking = nfs41_discover_server_trunking,
};
@@ -6942,7 +7392,6 @@ static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.recover_open = nfs4_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
- .get_clid_cred = nfs4_get_setclientid_cred,
};
#if defined(CONFIG_NFS_V4_1)
@@ -6952,7 +7401,6 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
.recover_open = nfs41_open_expired,
.recover_lock = nfs41_lock_expired,
.establish_clid = nfs41_init_clientid,
- .get_clid_cred = nfs4_get_exchange_id_cred,
};
#endif /* CONFIG_NFS_V4_1 */
@@ -7004,11 +7452,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
};
#endif
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+ .minor_version = 2,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_CHANGE_ATTR
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1,
+ .call_sync = nfs4_call_sync_sequence,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
[0] = &nfs_v4_0_minor_ops,
#if defined(CONFIG_NFS_V4_1)
[1] = &nfs_v4_1_minor_ops,
#endif
+#if defined(CONFIG_NFS_V4_2)
+ [2] = &nfs_v4_2_minor_ops,
+#endif
};
const struct inode_operations nfs4_dir_inode_operations = {
@@ -7108,6 +7578,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ &nfs4_xattr_nfs4_label_handler,
+#endif
NULL
};
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..202e3633d555 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -441,7 +441,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
struct rpc_xprt *xprt;
struct rpc_cred *cred;
- cred = nfs4_get_exchange_id_cred(session->clp);
+ cred = nfs4_get_clid_cred(session->clp);
nfs4_proc_destroy_session(session, cred);
if (cred)
put_rpccred(cred);
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
return 0;
}
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
{
- struct nfs_client *clp = server->nfs_client;
- struct nfs4_session *session;
- unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
- unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
if (!nfs4_has_session(clp))
return 0;
- if (server->rsize != 0)
- target_max_resp_sz = server->rsize;
- target_max_resp_sz += nfs41_maxread_overhead;
-
- if (server->wsize != 0)
- target_max_rqst_sz = server->wsize;
- target_max_rqst_sz += nfs41_maxwrite_overhead;
-
- session = clp->cl_session;
- spin_lock(&clp->cl_lock);
- if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
- /* Initialise targets and channel attributes */
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
- session->fc_target_max_resp_sz = target_max_resp_sz;
- session->fc_attrs.max_resp_sz = target_max_resp_sz;
- } else {
- /* Just adjust the targets */
- if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
- session->fc_target_max_rqst_sz = target_max_rqst_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- if (target_max_resp_sz > session->fc_target_max_resp_sz) {
- session->fc_target_max_resp_sz = target_max_resp_sz;
- set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
- }
- }
- spin_unlock(&clp->cl_lock);
-
- if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- nfs4_schedule_lease_recovery(clp);
-
+ clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
return nfs41_check_session_ready(clp);
}
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..86a066926f24 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -8,7 +8,7 @@
#define __LINUX_FS_NFS_NFS4SESSION_H
/* maximum number of slots to use */
-#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
+#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -66,9 +66,6 @@ struct nfs4_session {
struct nfs4_channel_attrs bc_attrs;
struct nfs4_slot_table bc_slot_table;
struct nfs_client *clp;
- /* Create session arguments */
- unsigned int fc_target_max_rqst_sz;
- unsigned int fc_target_max_resp_sz;
};
enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
#else /* defined(CONFIG_NFS_V4_1) */
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
{
return 0;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ff10b4aa534c..6818964bb7c0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -154,6 +154,19 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
return cred;
}
+static void nfs4_root_machine_cred(struct nfs_client *clp)
+{
+ struct rpc_cred *cred, *new;
+
+ new = rpc_lookup_machine_cred(NULL);
+ spin_lock(&clp->cl_lock);
+ cred = clp->cl_machine_cred;
+ clp->cl_machine_cred = new;
+ spin_unlock(&clp->cl_lock);
+ if (cred != NULL)
+ put_rpccred(cred);
+}
+
static struct rpc_cred *
nfs4_get_renew_cred_server_locked(struct nfs_server *server)
{
@@ -228,19 +241,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
return status;
}
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
{
- struct nfs4_session *ses = clp->cl_session;
- struct nfs4_slot_table *tbl;
-
- if (ses == NULL)
- return;
- tbl = &ses->fc_slot_table;
if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_lock(&tbl->slot_tbl_lock);
nfs41_wake_slot_table(tbl);
@@ -248,6 +250,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
}
}
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+
+ if (ses != NULL) {
+ nfs4_end_drain_slot_table(&ses->bc_slot_table);
+ nfs4_end_drain_slot_table(&ses->fc_slot_table);
+ }
+}
+
/*
* Signal state manager thread if session fore channel is drained
*/
@@ -340,62 +352,21 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
return nfs41_walk_client_list(clp, result, cred);
}
-struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
-{
- struct rpc_cred *cred;
-
- spin_lock(&clp->cl_lock);
- cred = nfs4_get_machine_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
- return cred;
-}
-
#endif /* CONFIG_NFS_V4_1 */
-static struct rpc_cred *
-nfs4_get_setclientid_cred_server(struct nfs_server *server)
-{
- struct nfs_client *clp = server->nfs_client;
- struct rpc_cred *cred = NULL;
- struct nfs4_state_owner *sp;
- struct rb_node *pos;
-
- spin_lock(&clp->cl_lock);
- pos = rb_first(&server->state_owners);
- if (pos != NULL) {
- sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
- cred = get_rpccred(sp->so_cred);
- }
- spin_unlock(&clp->cl_lock);
- return cred;
-}
-
/**
- * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
+ * nfs4_get_clid_cred - Acquire credential for a setclientid operation
* @clp: client state handle
*
* Returns an rpc_cred with reference count bumped, or NULL.
*/
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
{
- struct nfs_server *server;
struct rpc_cred *cred;
spin_lock(&clp->cl_lock);
cred = nfs4_get_machine_cred_locked(clp);
spin_unlock(&clp->cl_lock);
- if (cred != NULL)
- goto out;
-
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- cred = nfs4_get_setclientid_cred_server(server);
- if (cred != NULL)
- break;
- }
- rcu_read_unlock();
-
-out:
return cred;
}
@@ -1194,7 +1165,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
snprintf(buf, sizeof(buf), "%s-manager",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
rcu_read_unlock();
- task = kthread_run(nfs4_run_state_manager, clp, buf);
+ task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
if (IS_ERR(task)) {
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
@@ -1563,11 +1534,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
}
static void nfs4_reclaim_complete(struct nfs_client *clp,
- const struct nfs4_state_recovery_ops *ops)
+ const struct nfs4_state_recovery_ops *ops,
+ struct rpc_cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
- (void)ops->reclaim_complete(clp);
+ (void)ops->reclaim_complete(clp, cred);
}
static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1584,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
+ const struct nfs4_state_recovery_ops *ops;
+ struct rpc_cred *cred;
+
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
- nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+ ops = clp->cl_mvops->reboot_recovery_ops;
+ cred = nfs4_get_clid_cred(clp);
+ nfs4_reclaim_complete(clp, ops, cred);
+ put_rpccred(cred);
}
static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1726,7 +1704,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
cred = ops->get_state_renewal_cred_locked(clp);
spin_unlock(&clp->cl_lock);
if (cred == NULL) {
- cred = nfs4_get_setclientid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
status = -ENOKEY;
if (cred == NULL)
goto out;
@@ -1798,7 +1776,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
clp->cl_mvops->reboot_recovery_ops;
int status;
- cred = ops->get_clid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
return -ENOENT;
status = ops->establish_clid(clp, cred);
@@ -1872,7 +1850,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
mutex_lock(&nfs_clid_init_mutex);
again:
status = -ENOENT;
- cred = ops->get_clid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
goto out_unlock;
@@ -1890,7 +1868,11 @@ again:
__func__, status);
goto again;
case -EACCES:
- if (i++)
+ if (i++ == 0) {
+ nfs4_root_machine_cred(clp);
+ goto again;
+ }
+ if (i > 2)
break;
case -NFS4ERR_CLID_INUSE:
case -NFS4ERR_WRONGSEC:
@@ -2046,7 +2028,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
if (!nfs4_has_session(clp))
return 0;
nfs4_begin_drain_session(clp);
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
status = nfs4_proc_destroy_session(clp->cl_session, cred);
switch (status) {
case 0:
@@ -2089,7 +2071,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
if (!nfs4_has_session(clp))
return 0;
nfs4_begin_drain_session(clp);
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_bind_conn_to_session(clp, cred);
if (cred)
put_rpccred(cred);
@@ -2110,7 +2092,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
}
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
-static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
+static void nfs4_end_drain_session(struct nfs_client *clp) { }
static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
#include "delegation.h"
#include "internal.h"
#include "nfs4_fs.h"
+#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
{
int err;
- err = nfs_idmap_init();
+ err = nfs_dns_resolver_init();
if (err)
goto out;
- err = nfs4_register_sysctl();
+ err = nfs_idmap_init();
if (err)
goto out1;
+ err = nfs4_register_sysctl();
+ if (err)
+ goto out2;
+
register_nfs_version(&nfs_v4);
return 0;
-out1:
+out2:
nfs_idmap_quit();
+out1:
+ nfs_dns_resolver_destroy();
out:
return err;
}
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
unregister_nfs_version(&nfs_v4);
nfs4_unregister_sysctl();
nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
}
MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..1a4a3bd415ed 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define nfs4_label_maxsz 0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
/* We support only one layout type per file system */
#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
/* This is based on getfattr, which uses the most attributes: */
#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
3 + 3 + 3 + nfs4_owner_maxsz + \
- nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+ nfs4_group_maxsz + nfs4_label_maxsz + \
+ decode_mdsthreshold_maxsz))
#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
nfs4_fattr_value_maxsz)
#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
1 + 2 + 1 + \
nfs4_owner_maxsz + \
nfs4_group_maxsz + \
+ nfs4_label_maxsz + \
4 + 4)
#define encode_savefh_maxsz (op_encode_hdr_maxsz)
#define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
encode_stateid_maxsz + 3)
#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
- 2 + encode_verifier_maxsz + 5)
+ 2 + encode_verifier_maxsz + 5 + \
+ nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + \
+ nfs4_label_maxsz + nfs4_fattr_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
decode_sequence_maxsz +
decode_putfh_maxsz) *
XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz) *
+ XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
#endif /* CONFIG_NFS_V4_1 */
static const umode_t nfs_type2fmt[] = {
@@ -968,32 +988,35 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
}
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+ const struct nfs4_label *label,
+ const struct nfs_server *server)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
int owner_namelen = 0;
int owner_grouplen = 0;
__be32 *p;
- __be32 *q;
- int len;
- uint32_t bmval0 = 0;
- uint32_t bmval1 = 0;
+ unsigned i;
+ uint32_t len = 0;
+ uint32_t bmval_len;
+ uint32_t bmval[3] = { 0 };
/*
* We reserve enough space to write the entire attribute buffer at once.
* In the worst-case, this would be
- * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
- * = 36 bytes, plus any contribution from variable-length fields
+ * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+ * = 40 bytes, plus any contribution from variable-length fields
* such as owner/group.
*/
- len = 16;
-
- /* Sigh */
- if (iap->ia_valid & ATTR_SIZE)
+ if (iap->ia_valid & ATTR_SIZE) {
+ bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
- if (iap->ia_valid & ATTR_MODE)
+ }
+ if (iap->ia_valid & ATTR_MODE) {
+ bmval[1] |= FATTR4_WORD1_MODE;
len += 4;
+ }
if (iap->ia_valid & ATTR_UID) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
@@ -1004,6 +1027,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
owner_namelen = sizeof("nobody") - 1;
/* goto out; */
}
+ bmval[1] |= FATTR4_WORD1_OWNER;
len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
}
if (iap->ia_valid & ATTR_GID) {
@@ -1015,75 +1039,72 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
owner_grouplen = sizeof("nobody") - 1;
/* goto out; */
}
+ bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
}
- if (iap->ia_valid & ATTR_ATIME_SET)
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
len += 16;
- else if (iap->ia_valid & ATTR_ATIME)
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
len += 4;
- if (iap->ia_valid & ATTR_MTIME_SET)
+ }
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 16;
- else if (iap->ia_valid & ATTR_MTIME)
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
- p = reserve_space(xdr, len);
+ }
+ if (label) {
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+ bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ }
- /*
- * We write the bitmap length now, but leave the bitmap and the attribute
- * buffer length to be backfilled at the end of this routine.
- */
- *p++ = cpu_to_be32(2);
- q = p;
- p += 3;
+ if (bmval[2] != 0)
+ bmval_len = 3;
+ else if (bmval[1] != 0)
+ bmval_len = 2;
+ else
+ bmval_len = 1;
- if (iap->ia_valid & ATTR_SIZE) {
- bmval0 |= FATTR4_WORD0_SIZE;
+ p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
+
+ *p++ = cpu_to_be32(bmval_len);
+ for (i = 0; i < bmval_len; i++)
+ *p++ = cpu_to_be32(bmval[i]);
+ *p++ = cpu_to_be32(len);
+
+ if (bmval[0] & FATTR4_WORD0_SIZE)
p = xdr_encode_hyper(p, iap->ia_size);
- }
- if (iap->ia_valid & ATTR_MODE) {
- bmval1 |= FATTR4_WORD1_MODE;
+ if (bmval[1] & FATTR4_WORD1_MODE)
*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
- }
- if (iap->ia_valid & ATTR_UID) {
- bmval1 |= FATTR4_WORD1_OWNER;
+ if (bmval[1] & FATTR4_WORD1_OWNER)
p = xdr_encode_opaque(p, owner_name, owner_namelen);
- }
- if (iap->ia_valid & ATTR_GID) {
- bmval1 |= FATTR4_WORD1_OWNER_GROUP;
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
p = xdr_encode_opaque(p, owner_group, owner_grouplen);
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
- if (iap->ia_valid & ATTR_ATIME_SET) {
- bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
- }
- else if (iap->ia_valid & ATTR_ATIME) {
- bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
- }
- if (iap->ia_valid & ATTR_MTIME_SET) {
- bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
- }
- else if (iap->ia_valid & ATTR_MTIME) {
- bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
-
- /*
- * Now we backfill the bitmap and the attribute buffer length.
- */
- if (len != ((char *)p - (char *)q) + 4) {
- printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
- len, ((char *)p - (char *)q) + 4);
- BUG();
+ if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ *p++ = cpu_to_be32(label->lfs);
+ *p++ = cpu_to_be32(label->pi);
+ *p++ = cpu_to_be32(label->len);
+ p = xdr_encode_opaque_fixed(p, label->label, label->len);
}
- len = (char *)p - (char *)q - 12;
- *q++ = htonl(bmval0);
- *q++ = htonl(bmval1);
- *q = htonl(len);
/* out: */
}
@@ -1136,7 +1157,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1209,10 @@ encode_getattr_three(struct xdr_stream *xdr,
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
- bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+ encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+ bitmask[1] & nfs4_fattr_bitmap[1],
+ bitmask[2] & nfs4_fattr_bitmap[2],
+ hdr);
}
static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1390,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1404,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->server);
+ encode_attrs(xdr, &dummy, arg->label, arg->server);
}
}
@@ -1532,7 +1555,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
- uint32_t attrs[2] = {
+ uint32_t attrs[3] = {
FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
@@ -1555,20 +1578,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
encode_uint64(xdr, readdir->cookie);
encode_nfs4_verifier(xdr, &readdir->verifier);
- p = reserve_space(xdr, 20);
+ p = reserve_space(xdr, encode_readdir_space);
*p++ = cpu_to_be32(dircount);
*p++ = cpu_to_be32(readdir->count);
- *p++ = cpu_to_be32(2);
-
+ *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
- *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+ if (encode_readdir_bitmask_sz > 2) {
+ if (hdr->minorversion > 1)
+ attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+ }
memcpy(verf, readdir->verifier.data, sizeof(verf));
- dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+ dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
__func__,
(unsigned long long)readdir->cookie,
verf[0], verf[1],
attrs[0] & readdir->bitmask[0],
- attrs[1] & readdir->bitmask[1]);
+ attrs[1] & readdir->bitmask[1],
+ attrs[2] & readdir->bitmask[2]);
}
static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1656,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, server);
+ encode_attrs(xdr, arg->iap, arg->label, server);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1918,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
- *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
+ *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
*p++ = cpu_to_be32(0); /* bitmap length 0 */
}
@@ -4038,6 +4067,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
return status;
}
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_label *label)
+{
+ uint32_t pi = 0;
+ uint32_t lfs = 0;
+ __u32 len;
+ __be32 *p;
+ int status = 0;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ lfs = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ pi = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (len < NFS4_MAXLABELLEN) {
+ if (label) {
+ memcpy(label->label, p, len);
+ label->len = len;
+ label->pi = pi;
+ label->lfs = lfs;
+ status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+ }
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ } else
+ printk(KERN_WARNING "%s: label too long (%u)!\n",
+ __func__, len);
+ }
+ if (label && label->label)
+ dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+ (char *)label->label, label->len, label->pi, label->lfs);
+ return status;
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
{
int status = 0;
@@ -4380,7 +4459,7 @@ out_overflow:
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- struct nfs4_fs_locations *fs_loc,
+ struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
const struct nfs_server *server)
{
int status;
@@ -4488,6 +4567,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
+ if (label) {
+ status = decode_attr_security_label(xdr, bitmap, label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+ }
+
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
@@ -4495,7 +4581,7 @@ xdr_error:
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
- const struct nfs_server *server)
+ struct nfs4_label *label, const struct nfs_server *server)
{
unsigned int savep;
uint32_t attrlen,
@@ -4514,7 +4600,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+ label, server);
if (status < 0)
goto xdr_error;
@@ -4524,10 +4611,16 @@ xdr_error:
return status;
}
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct nfs4_label *label, const struct nfs_server *server)
+{
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
}
/*
@@ -5919,7 +6012,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -5945,7 +6038,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr,
+ res->label, res->server);
out:
return status;
}
@@ -6036,7 +6130,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6065,7 +6159,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6097,7 +6191,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
+ status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6230,7 +6324,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (res->access_request)
decode_access(xdr, &res->access_supported, &res->access_result);
- decode_getfattr(xdr, res->f_attr, res->server);
+ decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
out:
return status;
}
@@ -6307,7 +6401,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ decode_getfattr_label(xdr, res->fattr, res->label, res->server);
out:
return status;
}
@@ -6696,7 +6790,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
NULL, res->fs_locations,
- res->fs_locations->server);
+ NULL, res->fs_locations->server);
out:
return status;
}
@@ -7109,7 +7203,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
goto out_overflow;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- NULL, entry->server) < 0)
+ NULL, entry->label, entry->server) < 0)
goto out_overflow;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
pd.pgbase = 0;
pd.pglen = PAGE_SIZE;
pd.mincount = 0;
+ pd.maxcount = PAGE_SIZE;
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+ pnfslay->plh_lc_cred);
dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
if (err)
goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
-static inline u64
+static u64
end_offset(u64 start, u64 len)
{
u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
* start2 end2
* [----------------)
*/
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
}
static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
- struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
{
return (recall_range->iomode == IOMODE_ANY ||
lseg_range->iomode == recall_range->iomode) &&
- lo_seg_intersecting(lseg_range, recall_range);
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
}
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
/* Synchronously retrieve layout information from server and
* store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
lrp->args.inode = ino;
lrp->args.layout = lo;
lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = lo->plh_lc_cred;
status = nfs4_proc_layoutreturn(lrp);
out:
@@ -984,8 +986,8 @@ out:
* are seen first.
*/
static s64
-cmp_layout(struct pnfs_layout_range *l1,
- struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
s64 d;
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+ if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
INIT_LIST_HEAD(&lo->plh_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
- lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+ lo->plh_lc_cred = get_rpccred(ctx->cred);
return lo;
}
@@ -1091,21 +1093,21 @@ out_existing:
* READ READ true
* READ RW true
*/
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
- struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+ const struct pnfs_layout_range *range)
{
struct pnfs_layout_range range1;
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
- !lo_seg_intersecting(ls_range, range))
+ !pnfs_lseg_range_intersecting(ls_range, range))
return 0;
/* range1 covers only the first byte in the range */
range1 = *range;
range1.length = 1;
- return lo_seg_contained(ls_range, &range1);
+ return pnfs_lseg_range_contained(ls_range, &range1);
}
/*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(&lseg->pls_range, range)) {
+ pnfs_lseg_range_match(&lseg->pls_range, range)) {
ret = pnfs_get_lseg(lseg);
break;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
struct nfs4_deviceid dev_id;
unsigned int layout_type;
unsigned int mincount;
+ unsigned int maxcount; /* gdia_maxcount */
struct page **pages;
unsigned int pgbase;
- unsigned int pglen;
+ unsigned int pglen; /* reply buffer length */
};
#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
const struct nfs_fh *fh,
struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
- struct pnfs_device *dev);
+ struct pnfs_device *dev,
+ struct rpc_cred *cred);
extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct nfs_fattr *fattr, struct nfs4_label *label)
{
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs_proc_lookup(struct inode *dir, struct qstr *name,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ struct nfs4_label *label)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* should fill in the data with a LOOKUP call on the wire.
*/
if (status == 0)
- status = nfs_instantiate(dentry, fh, fattr);
+ status = nfs_instantiate(dentry, fh, fattr, NULL);
out_free:
nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..d26bd63e93ce 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
enum {
Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
- Opt_vers_4_1,
+ Opt_vers_4_1, Opt_vers_4_2,
Opt_vers_err
};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
{ Opt_vers_4, "4" },
{ Opt_vers_4_0, "4.0" },
{ Opt_vers_4_1, "4.1" },
+ { Opt_vers_4_2, "4.2" },
{ Opt_vers_err, NULL }
};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
seq_printf(m, "\n\tnfsv4:\t");
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
show_sessions(m, nfss);
show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
mnt->version = 4;
mnt->minorversion = 1;
break;
+ case Opt_vers_4_2:
+ mnt->version = 4;
+ mnt->minorversion = 2;
+ break;
default:
return 0;
}
@@ -1608,29 +1614,13 @@ out_security_failure:
}
/*
- * Select a security flavor for this mount. The selected flavor
- * is planted in args->auth_flavors[0].
- *
- * Returns 0 on success, -EACCES on failure.
+ * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * the server. Returns 0 if it's ok, and -EACCES if not.
*/
-static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
- struct nfs_mount_request *request)
+static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+ rpc_authflavor_t *server_authlist, unsigned int count)
{
- unsigned int i, count = *(request->auth_flav_len);
- rpc_authflavor_t flavor;
-
- /*
- * The NFSv2 MNT operation does not return a flavor list.
- */
- if (args->mount_server.version != NFS_MNT3_VERSION)
- goto out_default;
-
- /*
- * Certain releases of Linux's mountd return an empty
- * flavor list in some cases.
- */
- if (count == 0)
- goto out_default;
+ unsigned int i;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
* means that the server will ignore the rpc creds, so any flavor
* can be used.
*/
- if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
- for (i = 0; i < count; i++) {
- if (args->auth_flavors[0] == request->auth_flavs[i] ||
- request->auth_flavs[i] == RPC_AUTH_NULL)
- goto out;
- }
- dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
- args->auth_flavors[0]);
- goto out_err;
- }
-
- /*
- * RFC 2623, section 2.7 suggests we SHOULD prefer the
- * flavor listed first. However, some servers list
- * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
- */
- for (i = 0; i < count; i++) {
- struct rpcsec_gss_info info;
-
- flavor = request->auth_flavs[i];
- switch (flavor) {
- case RPC_AUTH_UNIX:
- goto out_set;
- case RPC_AUTH_NULL:
- continue;
- default:
- if (rpcauth_get_gssinfo(flavor, &info) == 0)
- goto out_set;
- }
- }
-
- /*
- * As a last chance, see if the server list contains AUTH_NULL -
- * if it does, use the default flavor.
- */
for (i = 0; i < count; i++) {
- if (request->auth_flavs[i] == RPC_AUTH_NULL)
- goto out_default;
+ if (args->auth_flavors[0] == server_authlist[i] ||
+ server_authlist[i] == RPC_AUTH_NULL)
+ goto out;
}
- dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
- goto out_err;
+ dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
+ args->auth_flavors[0]);
+ return -EACCES;
-out_default:
- /* use default if flavor not already set */
- flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
- RPC_AUTH_UNIX : args->auth_flavors[0];
-out_set:
- args->auth_flavors[0] = flavor;
out:
- dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
return 0;
-out_err:
- return -EACCES;
}
/*
@@ -1701,10 +1650,10 @@ out_err:
* corresponding to the provided path.
*/
static int nfs_request_mount(struct nfs_parsed_mount_data *args,
- struct nfs_fh *root_fh)
+ struct nfs_fh *root_fh,
+ rpc_authflavor_t *server_authlist,
+ unsigned int *server_authlist_len)
{
- rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
- unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
struct nfs_mount_request request = {
.sap = (struct sockaddr *)
&args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
.protocol = args->mount_server.protocol,
.fh = root_fh,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
- .auth_flav_len = &server_authlist_len,
+ .auth_flav_len = server_authlist_len,
.auth_flavs = server_authlist,
.net = args->net,
};
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
return status;
}
- return nfs_select_flavor(args, &request);
+ return 0;
}
-struct dentry *nfs_try_mount(int flags, const char *dev_name,
- struct nfs_mount_info *mount_info,
- struct nfs_subversion *nfs_mod)
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+ struct nfs_subversion *nfs_mod)
{
int status;
- struct nfs_server *server;
+ unsigned int i;
+ bool tried_auth_unix = false;
+ bool auth_null_in_list = false;
+ struct nfs_server *server = ERR_PTR(-EACCES);
+ struct nfs_parsed_mount_data *args = mount_info->parsed;
+ rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+ unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+ status = nfs_request_mount(args, mount_info->mntfh, authlist,
+ &authlist_len);
+ if (status)
+ return ERR_PTR(status);
- if (mount_info->parsed->need_mount) {
- status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
+ /*
+ * Was a sec= authflavor specified in the options? First, verify
+ * whether the server supports it, and then just try to use it if so.
+ */
+ if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+ status = nfs_verify_authflavor(args, authlist, authlist_len);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
if (status)
return ERR_PTR(status);
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
}
- /* Get a volume representation */
- server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ /*
+ * No sec= option was provided. RFC 2623, section 2.7 suggests we
+ * SHOULD prefer the flavor listed first. However, some servers list
+ * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+ */
+ for (i = 0; i < authlist_len; ++i) {
+ rpc_authflavor_t flavor;
+ struct rpcsec_gss_info info;
+
+ flavor = authlist[i];
+ switch (flavor) {
+ case RPC_AUTH_UNIX:
+ tried_auth_unix = true;
+ break;
+ case RPC_AUTH_NULL:
+ auth_null_in_list = true;
+ continue;
+ default:
+ if (rpcauth_get_gssinfo(flavor, &info) != 0)
+ continue;
+ /* Fallthrough */
+ }
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+ args->auth_flavors[0] = flavor;
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+ if (!IS_ERR(server))
+ return server;
+ }
+
+ /*
+ * Nothing we tried so far worked. At this point, give up if we've
+ * already tried AUTH_UNIX or if the server's list doesn't contain
+ * AUTH_NULL
+ */
+ if (tried_auth_unix || !auth_null_in_list)
+ return server;
+
+ /* Last chance! Try AUTH_UNIX */
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+}
+
+struct dentry *nfs_try_mount(int flags, const char *dev_name,
+ struct nfs_mount_info *mount_info,
+ struct nfs_subversion *nfs_mod)
+{
+ struct nfs_server *server;
+
+ if (mount_info->parsed->need_mount)
+ server = nfs_try_mount_request(mount_info, nfs_mod);
+ else
+ server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+
if (IS_ERR(server))
return ERR_CAST(server);
@@ -2067,6 +2084,8 @@ static int nfs_validate_text_mount_data(void *options,
max_namelen = NFS4_MAXNAMLEN;
max_pathlen = NFS4_MAXPATHLEN;
nfs_validate_transport_protocol(args);
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
nfs4_validate_mount_flags(args);
#else
goto out_v4_not_compiled;
@@ -2089,6 +2108,10 @@ static int nfs_validate_text_mount_data(void *options,
out_v4_not_compiled:
dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
return -EPROTONOSUPPORT;
+#else
+out_invalid_transport_udp:
+ dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+ return -EINVAL;
#endif /* !CONFIG_NFS_V4 */
out_no_address:
@@ -2412,7 +2435,21 @@ static int nfs_bdi_register(struct nfs_server *server)
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
- return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+ int error;
+ unsigned long kflags = 0, kflags_out = 0;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+ error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+ kflags, &kflags_out);
+ if (error)
+ goto err;
+
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+ return error;
}
EXPORT_SYMBOL_GPL(nfs_set_sb_security);
@@ -2447,6 +2484,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
if (server->flags & NFS_MOUNT_NOAC)
sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+ if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
+ if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
if (IS_ERR(s)) {
@@ -2680,6 +2721,8 @@ static int nfs4_validate_mount_data(void *options,
args->acdirmax = data->acdirmax;
args->nfs_server.protocol = data->proto;
nfs_validate_transport_protocol(args);
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
break;
default:
@@ -2700,6 +2743,10 @@ out_inval_auth:
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
+
+out_invalid_transport_udp:
+ dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+ return -EINVAL;
}
/*
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- dentry->d_count);
+ d_count(dentry));
nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
return PageUptodate(page) != 0;
}
+/* If we know the page is up to date, and we're not using byte range locks (or
+ * if we have the whole file locked for writing), it may be more efficient to
+ * extend the write to cover the entire page in order to avoid fragmentation
+ * inefficiencies.
+ *
+ * If the file is opened for synchronous writes or if we have a write delegation
+ * from the server then we can just skip the rest of the checks.
+ */
+static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
+{
+ if (file->f_flags & O_DSYNC)
+ return 0;
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ return 1;
+ if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
+ (inode->i_flock->fl_start == 0 &&
+ inode->i_flock->fl_end == OFFSET_MAX &&
+ inode->i_flock->fl_type != F_RDLCK)))
+ return 1;
+ return 0;
+}
+
/*
* Update and possibly write a cached page of an NFS file.
*
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
file->f_path.dentry->d_name.name, count,
(long long)(page_file_offset(page) + offset));
- /* If we're not using byte range locks, and we know the page
- * is up to date, it may be more efficient to extend the write
- * to cover the entire page in order to avoid fragmentation
- * inefficiencies.
- */
- if (nfs_write_pageuptodate(page, inode) &&
- inode->i_flock == NULL &&
- !(file->f_flags & O_DSYNC)) {
+ if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
If unsure, say N.
+config NFSD_V4_SECURITY_LABEL
+ bool "Provide Security Label support for NFSv4 server"
+ depends on NFSD_V4 && SECURITY
+ help
+
+ Say Y here if you want enable fine-grained security label attribute
+ support for NFS version 4. Security labels allow security modules like
+ SELinux and Smack to label files to facilitate enforcement of their policies.
+ Without this an NFSv4 mount will have the same label on each file.
+
+ If you do not wish to enable fine-grained security labels SELinux or
+ Smack policies on NFSv4 files, say N.
+
+ WARNING: there is still a chance of backwards-incompatible protocol changes.
+ For now we recommend "Y" only for developers and testers."
+
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..0d4c410e4589 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
#include "current_stateid.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{
+ struct inode *inode = resfh->fh_dentry->d_inode;
+ int status;
+
+ mutex_lock(&inode->i_mutex);
+ status = security_inode_setsecctx(resfh->fh_dentry,
+ label->data, label->len);
+ mutex_unlock(&inode->i_mutex);
+
+ if (status)
+ /*
+ * XXX: We should really fail the whole open, but we may
+ * already have created a new file, so it may be too
+ * late. For now this seems the least of evils:
+ */
+ bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return;
+}
+#else
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{ }
+#endif
+
#define NFSDDBG_FACILITY NFSDDBG_PROC
static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
+ if (!status && open->op_label.len)
+ nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+
/*
* Following rfc 3530 14.2.16, use the returned bitmask
* to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
accmode = NFSD_MAY_NOP;
- if (open->op_created)
+ if (open->op_created ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
accmode |= NFSD_MAY_OWNER_OVERRIDE;
status = do_open_permission(rqstp, resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
+ if (create->cr_label.len)
+ nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+
if (create->cr_acl != NULL)
do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
setattr->sa_acl);
if (status)
goto out;
+ if (setattr->sa_label.len)
+ status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
+ &setattr->sa_label);
+ if (status)
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
out:
@@ -1251,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* According to RFC3010, this takes precedence over all other errors.
*/
status = nfserr_minor_vers_mismatch;
- if (args->minorversion > nfsd_supported_minorversion)
+ if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
goto out;
status = nfs41_check_op_ordering(args);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f17051838b41..1cb621131b00 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
static void free_session(struct nfsd4_session *);
-void nfsd4_put_session(struct nfsd4_session *ses)
+static bool is_session_dead(struct nfsd4_session *ses)
{
- atomic_dec(&ses->se_ref);
+ return ses->se_flags & NFS4_SESSION_DEAD;
}
-static bool is_session_dead(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
{
- return ses->se_flags & NFS4_SESSION_DEAD;
+ if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
+ free_session(ses);
}
-static __be32 mark_session_dead_locked(struct nfsd4_session *ses)
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
{
- if (atomic_read(&ses->se_ref))
+ if (atomic_read(&ses->se_ref) > ref_held_by_me)
return nfserr_jukebox;
ses->se_flags |= NFS4_SESSION_DEAD;
return nfs_ok;
@@ -281,19 +282,14 @@ static unsigned int file_hashval(struct inode *ino)
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
-static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
-{
- WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
- atomic_inc(&fp->fi_access[oflag]);
-}
-
static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
+ WARN_ON_ONCE(!fp->fi_fds[oflag]);
if (oflag == O_RDWR) {
- __nfs4_file_get_access(fp, O_RDONLY);
- __nfs4_file_get_access(fp, O_WRONLY);
+ atomic_inc(&fp->fi_access[O_RDONLY]);
+ atomic_inc(&fp->fi_access[O_WRONLY]);
} else
- __nfs4_file_get_access(fp, oflag);
+ atomic_inc(&fp->fi_access[oflag]);
}
static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
@@ -364,19 +360,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type)
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
{
struct nfs4_delegation *dp;
struct nfs4_file *fp = stp->st_file;
dprintk("NFSD alloc_init_deleg\n");
- /*
- * Major work on the lease subsystem (for example, to support
- * calbacks on stat) will be required before we can support
- * write delegations properly.
- */
- if (type != NFS4_OPEN_DELEGATE_READ)
- return NULL;
if (fp->fi_had_conflict)
return NULL;
if (num_delegations > max_delegations)
@@ -397,7 +386,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
INIT_LIST_HEAD(&dp->dl_recall_lru);
get_nfs4_file(fp);
dp->dl_file = fp;
- dp->dl_type = type;
+ dp->dl_type = NFS4_OPEN_DELEGATE_READ;
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1177,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
target->cr_gid = source->cr_gid;
target->cr_group_info = source->cr_group_info;
get_group_info(target->cr_group_info);
+ target->cr_gss_mech = source->cr_gss_mech;
+ if (source->cr_gss_mech)
+ gss_mech_get(source->cr_gss_mech);
return 0;
}
@@ -1262,6 +1254,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
}
+static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+ u32 service;
+
+ service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
+ return service == RPC_GSS_SVC_INTEGRITY ||
+ service == RPC_GSS_SVC_PRIVACY;
+}
+
+static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+{
+ struct svc_cred *cr = &rqstp->rq_cred;
+
+ if (!cl->cl_mach_cred)
+ return true;
+ if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
+ return false;
+ if (!svc_rqst_integrity_protected(rqstp))
+ return false;
+ if (!cr->cr_principal)
+ return false;
+ return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
+}
+
static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
static u32 current_clientid = 1;
@@ -1639,16 +1656,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
- /* Currently only support SP4_NONE */
switch (exid->spa_how) {
+ case SP4_MACH_CRED:
+ if (!svc_rqst_integrity_protected(rqstp))
+ return nfserr_inval;
case SP4_NONE:
break;
default: /* checked by xdr code */
WARN_ON_ONCE(1);
case SP4_SSV:
return nfserr_encr_alg_unsupp;
- case SP4_MACH_CRED:
- return nfserr_serverfault; /* no excuse :-/ */
}
/* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1680,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_inval;
goto out;
}
+ if (!mach_creds_match(conf, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
if (!creds_match) { /* case 9 */
status = nfserr_perm;
goto out;
@@ -1709,7 +1730,8 @@ out_new:
status = nfserr_jukebox;
goto out;
}
- new->cl_minorversion = 1;
+ new->cl_minorversion = cstate->minorversion;
+ new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -1839,6 +1861,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
return nfs_ok;
}
+static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
+{
+ switch (cbs->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ return nfs_ok;
+ default:
+ /*
+ * GSS case: the spec doesn't allow us to return this
+ * error. But it also doesn't allow us not to support
+ * GSS.
+ * I'd rather this fail hard than return some error the
+ * client might think it can already handle:
+ */
+ return nfserr_encr_alg_unsupp;
+ }
+}
+
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1854,6 +1894,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
+ status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
+ if (status)
+ return status;
status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
if (status)
return status;
@@ -1874,6 +1917,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
WARN_ON_ONCE(conf && unconf);
if (conf) {
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(conf, rqstp))
+ goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
@@ -1890,6 +1936,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
goto out_free_conn;
}
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(unconf, rqstp))
+ goto out_free_conn;
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
@@ -1957,7 +2006,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
{
struct nfsd4_session *session = cstate->session;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+ status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
+ if (status)
+ return status;
spin_lock(&nn->client_lock);
session->se_cb_prog = bc->bc_cb_program;
session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2039,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
status = nfserr_badsession;
if (!session)
goto out;
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(session->se_client, rqstp))
+ goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
goto out;
@@ -2014,6 +2070,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
{
struct nfsd4_session *ses;
__be32 status;
+ int ref_held_by_me = 0;
struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
nfs4_lock_state();
@@ -2021,6 +2078,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
if (!nfsd4_last_compound_op(r))
goto out;
+ ref_held_by_me++;
}
dump_sessionid(__func__, &sessionid->sessionid);
spin_lock(&nn->client_lock);
@@ -2028,17 +2086,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
status = nfserr_badsession;
if (!ses)
goto out_client_lock;
- status = mark_session_dead_locked(ses);
- if (status)
+ status = nfserr_wrong_cred;
+ if (!mach_creds_match(ses->se_client, r))
goto out_client_lock;
+ nfsd4_get_session_locked(ses);
+ status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
+ if (status)
+ goto out_put_session;
unhash_session(ses);
spin_unlock(&nn->client_lock);
nfsd4_probe_callback_sync(ses->se_client);
spin_lock(&nn->client_lock);
- free_session(ses);
status = nfs_ok;
+out_put_session:
+ nfsd4_put_session(ses);
out_client_lock:
spin_unlock(&nn->client_lock);
out:
@@ -2058,26 +2121,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
return NULL;
}
-static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
{
struct nfs4_client *clp = ses->se_client;
struct nfsd4_conn *c;
+ __be32 status = nfs_ok;
int ret;
spin_lock(&clp->cl_lock);
c = __nfsd4_find_conn(new->cn_xprt, ses);
- if (c) {
- spin_unlock(&clp->cl_lock);
- free_conn(new);
- return;
- }
+ if (c)
+ goto out_free;
+ status = nfserr_conn_not_bound_to_session;
+ if (clp->cl_mach_cred)
+ goto out_free;
__nfsd4_hash_conn(new, ses);
spin_unlock(&clp->cl_lock);
ret = nfsd4_register_conn(new);
if (ret)
/* oops; xprt is already down: */
nfsd4_conn_lost(&new->cn_xpt_user);
- return;
+ return nfs_ok;
+out_free:
+ spin_unlock(&clp->cl_lock);
+ free_conn(new);
+ return status;
}
static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2237,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (status)
goto out_put_session;
- nfsd4_sequence_check_conn(conn, session);
+ status = nfsd4_sequence_check_conn(conn, session);
conn = NULL;
+ if (status)
+ goto out_put_session;
/* Success! bump slot seqid */
slot->sl_seqid = seq->seqid;
@@ -2232,7 +2302,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfserr_stale_clientid;
goto out;
}
-
+ if (!mach_creds_match(clp, rqstp)) {
+ status = nfserr_wrong_cred;
+ goto out;
+ }
expire_client(clp);
out:
nfs4_unlock_state();
@@ -2940,13 +3013,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
return fl;
}
-static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
+static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
struct file_lock *fl;
int status;
- fl = nfs4_alloc_init_lease(dp, flag);
+ fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3037,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
return 0;
}
-static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
+static int nfs4_set_delegation(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_file;
if (!fp->fi_lease)
- return nfs4_setlease(dp, flag);
+ return nfs4_setlease(dp);
spin_lock(&recall_lock);
if (fp->fi_had_conflict) {
spin_unlock(&recall_lock);
@@ -3005,6 +3078,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
/*
* Attempt to hand out a delegation.
+ *
+ * Note we don't support write delegations, and won't until the vfs has
+ * proper support for them.
*/
static void
nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3089,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
int cb_up;
- int status = 0, flag = 0;
+ int status = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
open->op_recall = 1;
- flag = open->op_delegate_type;
- if (flag == NFS4_OPEN_DELEGATE_NONE)
- goto out;
+ if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
+ goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
- /* Let's not give out any delegations till everyone's
- * had the chance to reclaim theirs.... */
+ /*
+ * Let's not give out any delegations till everyone's
+ * had the chance to reclaim theirs....
+ */
if (locks_in_grace(net))
- goto out;
+ goto out_no_deleg;
if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
- goto out;
+ goto out_no_deleg;
+ /*
+ * Also, if the file was opened for write or
+ * create, there's a good chance the client's
+ * about to write to it, resulting in an
+ * immediate recall (since we don't support
+ * write delegations):
+ */
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
- flag = NFS4_OPEN_DELEGATE_WRITE;
- else
- flag = NFS4_OPEN_DELEGATE_READ;
+ goto out_no_deleg;
+ if (open->op_create == NFS4_OPEN_CREATE)
+ goto out_no_deleg;
break;
default:
- goto out;
+ goto out_no_deleg;
}
-
- dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
+ dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
if (dp == NULL)
goto out_no_deleg;
- status = nfs4_set_delegation(dp, flag);
+ status = nfs4_set_delegation(dp);
if (status)
goto out_free;
@@ -3053,24 +3135,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
STATEID_VAL(&dp->dl_stid.sc_stateid));
-out:
- open->op_delegate_type = flag;
- if (flag == NFS4_OPEN_DELEGATE_NONE) {
- if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
- open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
- dprintk("NFSD: WARNING: refusing delegation reclaim\n");
-
- /* 4.1 client asking for a delegation? */
- if (open->op_deleg_want)
- nfsd4_open_deleg_none_ext(open, status);
- }
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
return;
out_free:
unhash_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
out_no_deleg:
- flag = NFS4_OPEN_DELEGATE_NONE;
- goto out;
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+ open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
+ dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+ open->op_recall = 1;
+ }
+
+ /* 4.1 client asking for a delegation? */
+ if (open->op_deleg_want)
+ nfsd4_open_deleg_none_ext(open, status);
+ return;
}
static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3508,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
/* Returns true iff a is later than b: */
static bool stateid_generation_after(stateid_t *a, stateid_t *b)
{
- return (s32)a->si_generation - (s32)b->si_generation > 0;
+ return (s32)(a->si_generation - b->si_generation) > 0;
}
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
@@ -4435,7 +4516,6 @@ __be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_locku *locku)
{
- struct nfs4_lockowner *lo;
struct nfs4_ol_stateid *stp;
struct file *filp = NULL;
struct file_lock *file_lock = NULL;
@@ -4468,10 +4548,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_jukebox;
goto out;
}
- lo = lockowner(stp->st_stateowner);
locks_init_lock(file_lock);
file_lock->fl_type = F_UNLCK;
- file_lock->fl_owner = (fl_owner_t)lo;
+ file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
file_lock->fl_pid = current->tgid;
file_lock->fl_file = filp;
file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4569,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
- WARN_ON_ONCE(cstate->replay_owner);
- release_lockowner(lo);
- }
-
out:
nfsd4_bump_seqid(cstate, status);
if (!cstate->replay_owner)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..0c0f3ea90de5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
#include "cache.h"
#include "netns.h"
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+#endif
+
+
#define NFSDDBG_FACILITY NFSDDBG_XDR
/*
@@ -134,6 +139,19 @@ xdr_error: \
} \
} while (0)
+static void next_decode_page(struct nfsd4_compoundargs *argp)
+{
+ argp->pagelist++;
+ argp->p = page_address(argp->pagelist[0]);
+ if (argp->pagelen < PAGE_SIZE) {
+ argp->end = argp->p + (argp->pagelen>>2);
+ argp->pagelen = 0;
+ } else {
+ argp->end = argp->p + (PAGE_SIZE>>2);
+ argp->pagelen -= PAGE_SIZE;
+ }
+}
+
static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
* guarantee p points to at least nbytes bytes.
*/
memcpy(p, argp->p, avail);
- /* step to next page */
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + (argp->pagelen>>2);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
+ next_decode_page(argp);
memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
argp->p += XDR_QUADLEN(nbytes - avail);
return p;
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
static __be32
nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
- struct iattr *iattr, struct nfs4_acl **acl)
+ struct iattr *iattr, struct nfs4_acl **acl,
+ struct xdr_netobj *label)
{
int expected_len, len = 0;
u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
goto xdr_error;
}
}
+
+ label->len = 0;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* lfs: we don't use it */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32); /* pi: we don't use it either */
+ READ_BUF(4);
+ len += 4;
+ READ32(dummy32);
+ READ_BUF(dummy32);
+ if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
+ return nfserr_badlabel;
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
+ if (!label->data)
+ return nfserr_jukebox;
+ defer_free(argp, kfree, label->data);
+ memcpy(label->data, buf, dummy32);
+ }
+#endif
+
if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
|| bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
|| bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
/* callback_sec_params4 */
READ_BUF(4);
READ32(nr_secflavs);
- cbs->flavor = (u32)(-1);
+ if (nr_secflavs)
+ cbs->flavor = (u32)(-1);
+ else
+ /* Is this legal? Be generous, take it to mean AUTH_NONE: */
+ cbs->flavor = 0;
for (i = 0; i < nr_secflavs; ++i) {
READ_BUF(4);
READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
- &create->cr_acl);
+ &create->cr_acl, &create->cr_label);
if (status)
goto out;
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl);
+ &open->op_iattr, &open->op_acl, &open->op_label);
if (status)
goto out;
break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
if (status)
return status;
return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
- &setattr->sa_acl);
+ &setattr->sa_acl, &setattr->sa_label);
}
static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
+ [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
};
static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{
+ __be32 *p = *pp;
+
+ if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+ return nfserr_resource;
+
+ /*
+ * For now we use a 0 here to indicate the null translation; in
+ * the future we may place a call to translation code here.
+ */
+ if ((*buflen -= 8) < 0)
+ return nfserr_resource;
+
+ WRITE32(0); /* lfs */
+ WRITE32(0); /* pi */
+ p = xdr_encode_opaque(p, context, len);
+ *buflen -= (XDR_QUADLEN(len) << 2) + 4;
+
+ *pp = p;
+ return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{ return 0; }
+#endif
+
static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
{
/* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
int err;
int aclsupport = 0;
struct nfs4_acl *acl = NULL;
+ void *context = NULL;
+ int contextlen;
+ bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
}
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ err = security_inode_getsecctx(dentry->d_inode,
+ &context, &contextlen);
+ contextsupport = (err == 0);
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ if (err == -EOPNOTSUPP)
+ bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ else if (err)
+ goto out_nfserr;
+ }
+ }
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
if (bmval2) {
if ((buflen -= 16) < 0)
goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
+ if (!contextsupport)
+ word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
if (!word2) {
if ((buflen -= 12) < 0)
goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
get_parent_attributes(exp, &stat);
WRITE64(stat.ino);
}
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ status = nfsd4_encode_security_label(rqstp, context,
+ contextlen, &p, &buflen);
+ if (status)
+ goto out;
+ }
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
status = nfs_ok;
out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (context)
+ security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
kfree(acl);
if (fhp == &tempfh)
fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
{
__be32 *p;
- RESERVE_SPACE(12);
+ RESERVE_SPACE(16);
if (nfserr) {
- WRITE32(2);
+ WRITE32(3);
+ WRITE32(0);
WRITE32(0);
WRITE32(0);
}
else {
- WRITE32(2);
+ WRITE32(3);
WRITE32(setattr->sa_bmval[0]);
WRITE32(setattr->sa_bmval[1]);
+ WRITE32(setattr->sa_bmval[2]);
}
ADJUST_ARGS();
return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
return nfserr;
}
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+};
+
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
@@ -3264,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
/* state_protect4_r. Currently only support SP4_NONE */
BUG_ON(exid->spa_how != SP4_NONE);
WRITE32(exid->spa_how);
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ /* spo_must_enforce bitmap: */
+ WRITE32(2);
+ WRITE32(nfs4_minimal_spo_must_enforce[0]);
+ WRITE32(nfs4_minimal_spo_must_enforce[1]);
+ /* empty spo_must_allow bitmap: */
+ WRITE32(0);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
/* The server_owner struct */
WRITE64(minor_id); /* Minor id */
@@ -3635,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
if (nfsd4_has_session(cs)) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_client *clp = cs->session->se_client;
if (cs->status != nfserr_replay_cache) {
nfsd4_store_cache_entry(resp);
cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
}
/* Renew the clientid on success and on replay */
- put_client_renew(cs->session->se_client);
+ spin_lock(&nn->client_lock);
nfsd4_put_session(cs->session);
+ spin_unlock(&nn->client_lock);
+ put_client_renew(clp);
}
return 1;
}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
/*
* nfsd version
*/
-#define NFSD_SUPPORTED_MINOR_VERSION 1
+#define NFSD_SUPPORTED_MINOR_VERSION 2
/*
* Maximum blocksizes supported by daemon under various circumstances.
*/
@@ -53,7 +53,6 @@ struct readdir_cd {
extern struct svc_program nfsd_program;
extern struct svc_version nfsd_version2, nfsd_version3,
nfsd_version4;
-extern u32 nfsd_supported_minorversion;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
@@ -243,6 +242,12 @@ void nfsd_lockd_shutdown(void);
#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
/* error codes for internal use */
/* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +327,13 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#else
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#endif
+
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +348,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
static inline u32 nfsd_suppattrs2(u32 minorversion)
{
- return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
- : NFSD4_SUPPORTED_ATTRS_WORD2;
+ switch (minorversion) {
+ default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+ case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+ case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
+ }
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +365,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
};
-u32 nfsd_supported_minorversion;
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+ [0] = 1,
+ [1] = 1,
+};
int nfsd_vers(int vers, enum vers_op change)
{
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
return -1;
switch(change) {
case NFSD_SET:
- nfsd_supported_minorversion = minorversion;
+ nfsd_supported_minorversions[minorversion] = true;
break;
case NFSD_CLEAR:
- if (minorversion == 0)
- return -1;
- nfsd_supported_minorversion = minorversion - 1;
+ nfsd_supported_minorversions[minorversion] = false;
break;
case NFSD_TEST:
- return minorversion <= nfsd_supported_minorversion;
+ return nfsd_supported_minorversions[minorversion];
case NFSD_AVAIL:
return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
+ bool cl_mach_cred; /* SP4_MACH_CRED in force */
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a6bc8a7423db..c827acb0e943 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
#include <asm/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
+#include <linux/security.h>
#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
return 1;
}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ __be32 error;
+ int host_error;
+ struct dentry *dentry;
+
+ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+ if (error)
+ return error;
+
+ dentry = fhp->fh_dentry;
+
+ mutex_lock(&dentry->d_inode->i_mutex);
+ host_error = security_inode_setsecctx(dentry, label->data, label->len);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ return nfserr_notsupp;
+}
+#endif
+
#endif /* defined(CONFIG_NFSD_V4) */
#ifdef CONFIG_NFSD_V3
@@ -802,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
flags = O_WRONLY|O_LARGEFILE;
}
*filp = dentry_open(&path, flags, current_cred());
- if (IS_ERR(*filp))
+ if (IS_ERR(*filp)) {
host_err = PTR_ERR(*filp);
- else {
+ *filp = NULL;
+ } else {
host_err = ima_file_check(*filp, may_flags);
if (may_flags & NFSD_MAY_64BIT_COOKIE)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
/* nfsd/vfs.c */
-int fh_lock_parent(struct svc_fh *, struct dentry *);
int nfsd_racache_init(int);
void nfsd_racache_shutdown(void);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
struct nfs4_acl *);
int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+ struct xdr_netobj *);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
struct svc_fh *, char *, int);
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
-int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
- unsigned long size);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t *, struct readdir_cd *, filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
-int nfsd_notify_change(struct inode *, struct iattr *);
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
-int nfsd_sync_dir(struct dentry *dp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
#include "state.h"
#include "nfsd.h"
+#define NFSD4_MAX_SEC_LABEL_LEN 2048
#define NFSD4_MAX_TAGLEN 128
#define XDR_LEN(n) (((n) + 3) & ~3)
@@ -118,6 +119,7 @@ struct nfsd4_create {
struct iattr cr_iattr; /* request */
struct nfsd4_change_info cr_cinfo; /* response */
struct nfs4_acl *cr_acl;
+ struct xdr_netobj cr_label;
};
#define cr_linklen u.link.namelen
#define cr_linkname u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_acl *op_acl;
+ struct xdr_netobj op_label;
};
#define op_iattr iattr
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
u32 sa_bmval[3]; /* request */
struct iattr sa_iattr; /* request */
struct nfs4_acl *sa_acl;
+ struct xdr_netobj sa_label;
};
struct nfsd4_setclientid {
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
}
/**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+ unsigned long *desc_blocks)
+{
+ unsigned long blknum;
+ int ret;
+
+ ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+ if (likely(!ret))
+ *desc_blocks = DIV_ROUND_UP(
+ blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+ return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ * MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+ unsigned long desc_blocks)
+{
+ return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+ nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ * described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+ unsigned long desc_blocks = 0;
+ u64 entries_per_desc_block, nmax;
+ int err;
+
+ err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+ if (unlikely(err))
+ return err;
+
+ entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+ nilfs_palloc_groups_per_desc_block(inode);
+ nmax = entries_per_desc_block * desc_blocks;
+
+ if (nused == nmax &&
+ nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+ nmax += entries_per_desc_block;
+
+ if (nused > nmax)
+ return -ERANGE;
+
+ *nmaxp = nmax;
+ return 0;
+}
+
+/**
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
const struct buffer_head *, void *);
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
/**
* nilfs_palloc_req - persistent allocator request and reply
* @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
}
/**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+ u64 *nmaxinodes, u64 *nfreeinodes)
+{
+ u64 nused;
+ int err;
+
+ *nmaxinodes = 0;
+ *nfreeinodes = 0;
+
+ nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+ err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+ if (likely(!err))
+ *nfreeinodes = *nmaxinodes - nused;
+ return err;
+}
+
+/**
* nilfs_ifile_read - read or get ifile inode
* @sb: super block instance
* @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
int nilfs_ifile_delete_inode(struct inode *, ino_t);
int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
size_t inode_size, struct nilfs_inode *raw_inode,
struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
- atomic_add(n, &root->blocks_count);
+ atomic64_add(n, &root->blocks_count);
}
void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
- atomic_sub(n, &root->blocks_count);
+ atomic64_sub(n, &root->blocks_count);
}
/**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
- atomic_inc(&root->inodes_count);
+ atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
if (!ret)
- atomic_dec(&ii->i_root->inodes_count);
+ atomic64_dec(&ii->i_root->inodes_count);
nilfs_clear_inode(inode);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
raw_cp->cp_snapshot_list.ssl_next = 0;
raw_cp->cp_snapshot_list.ssl_prev = 0;
raw_cp->cp_inodes_count =
- cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+ cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
raw_cp->cp_blocks_count =
- cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+ cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
raw_cp->cp_nblk_inc =
cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
if (err)
goto failed_bh;
- atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
- atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+ atomic64_set(&root->inodes_count,
+ le64_to_cpu(raw_cp->cp_inodes_count));
+ atomic64_set(&root->blocks_count,
+ le64_to_cpu(raw_cp->cp_blocks_count));
nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
unsigned long overhead;
unsigned long nrsvblocks;
sector_t nfreeblocks;
+ u64 nmaxinodes, nfreeinodes;
int err;
/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (unlikely(err))
return err;
+ err = nilfs_ifile_count_free_inodes(root->ifile,
+ &nmaxinodes, &nfreeinodes);
+ if (unlikely(err)) {
+ printk(KERN_WARNING
+ "NILFS warning: fail to count free inodes: err %d.\n",
+ err);
+ if (err == -ERANGE) {
+ /*
+ * If nilfs_palloc_count_max_entries() returns
+ * -ERANGE error code then we simply treat
+ * curent inodes count as maximum possible and
+ * zero as free inodes value.
+ */
+ nmaxinodes = atomic64_read(&root->inodes_count);
+ nfreeinodes = 0;
+ err = 0;
+ } else
+ return err;
+ }
+
buf->f_type = NILFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = blocks - overhead;
buf->f_bfree = nfreeblocks;
buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
(buf->f_bfree - nrsvblocks) : 0;
- buf->f_files = atomic_read(&root->inodes_count);
- buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+ buf->f_files = nmaxinodes;
+ buf->f_ffree = nfreeinodes;
buf->f_namelen = NILFS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
static int nilfs_tree_was_touched(struct dentry *root_dentry)
{
- return root_dentry->d_count > 1;
+ return d_count(root_dentry) > 1;
}
/**
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
new->ifile = NULL;
new->nilfs = nilfs;
atomic_set(&new->count, 1);
- atomic_set(&new->inodes_count, 0);
- atomic_set(&new->blocks_count, 0);
+ atomic64_set(&new->inodes_count, 0);
+ atomic64_set(&new->blocks_count, 0);
rb_link_node(&new->rb_node, parent, p);
rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
struct the_nilfs *nilfs;
struct inode *ifile;
- atomic_t inodes_count;
- atomic_t blocks_count;
+ atomic64_t inodes_count;
+ atomic64_t blocks_count;
};
/* Special checkpoint number */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
spin_unlock(&fsn_mark->lock);
- /* nothing else could have found us thanks to the dnotify_mark_mutex */
+ /* nothing else could have found us thanks to the dnotify_groups
+ mark_mutex */
if (dn_mark->dn == NULL)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
}
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_mark_mutex);
+ mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
- fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+ fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+ NULL, 0);
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
- * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
- * only time we clean up the marks we need to get our mark off
- * the list. */
+ * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the
+ * fd is the only time we clean up the marks we need to get our mark
+ * off the list. */
if (f != filp) {
/* if we added ourselves, shoot ourselves, it's possible that
* the flush actually did shoot this fsn_mark. That's fine too
@@ -385,9 +386,9 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark(fsn_mark, dnotify_group);
+ fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
- mutex_unlock(&dnotify_mark_mutex);
+ mutex_unlock(&dnotify_group->mark_mutex);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1ea52f7c031f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
metadata->event_len = FAN_EVENT_METADATA_LEN;
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
+ metadata->reserved = 0;
metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -523,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -547,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
- if (!fsn_mark)
+ if (!fsn_mark) {
+ mutex_unlock(&group->mark_mutex);
return -ENOENT;
+ }
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark(fsn_mark, group);
+ fsnotify_destroy_mark_locked(fsn_mark, group);
+ mutex_unlock(&group->mark_mutex);
+
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
if (removed & inode->i_fsnotify_mask)
@@ -590,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
return mask & ~oldmask;
}
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+ struct inode *inode,
+ struct vfsmount *mnt)
+{
+ struct fsnotify_mark *mark;
+ int ret;
+
+ if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+ return ERR_PTR(-ENOSPC);
+
+ mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+ if (!mark)
+ return ERR_PTR(-ENOMEM);
+
+ fsnotify_init_mark(mark, fanotify_free_mark);
+ ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+ if (ret) {
+ fsnotify_put_mark(mark);
+ return ERR_PTR(ret);
+ }
+
+ return mark;
+}
+
+
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
unsigned int flags)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -627,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
- int ret = 0;
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -641,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
(atomic_read(&inode->i_writecount) > 0))
return 0;
+ mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_inode_mark(group, inode);
if (!fsn_mark) {
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
- return -ENOSPC;
-
- fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!fsn_mark)
- return -ENOMEM;
-
- fsnotify_init_mark(fsn_mark, fanotify_free_mark);
- ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
- if (ret)
- goto err;
+ fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+ if (IS_ERR(fsn_mark)) {
+ mutex_unlock(&group->mark_mutex);
+ return PTR_ERR(fsn_mark);
+ }
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+ mutex_unlock(&group->mark_mutex);
if (added & ~inode->i_fsnotify_mask)
fsnotify_recalc_inode_mask(inode);
-err:
+
fsnotify_put_mark(fsn_mark);
- return ret;
+ return 0;
}
/* fanotify syscalls */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
goto out_err;
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+ NULL, 0);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
-retry:
+ mutex_lock(&group->mark_mutex);
/* try to update and existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- /*
- * inotify_new_watch could race with another thread which did an
- * inotify_new_watch between the update_existing and the add watch
- * here, go back and try to update an existing mark again.
- */
- if (ret == -EEXIST)
- goto retry;
+ mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
* fsnotify inode mark locking/lifetime/and refcnting
*
* REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object. The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
*
* LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
*
+ * group->mark_mutex
* mark->lock
- * group->mark_lock
* inode->i_lock
*
- * mark->lock protects 2 things, mark->group and mark->inode. You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list. It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list. It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
*
* inode->i_lock protects the i_fsnotify_marks list anchored inside a
* given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
* inode. We take i_lock and walk the i_fsnotify_marks safely. For each
* mark on the list we take a reference (so the mark can't disappear under us).
* We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode. For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode. If
- * we see the group and inode are not NULL we take those locks. Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future. Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref. We drop our reference we took before we unhooked it
- * from the inode. When the ref hits 0 we can free the mark.
- *
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we dont race with a
+ * concurrent destroy_group by getting a ref to the marks group and taking the
+ * groups mutex.
+
* Very similarly for freeing by group, except we use free_g_list.
*
* This has the very interesting property of being able to run concurrently with
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0);
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
&ref_tree, NULL);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto bail;
}
ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
&extra_blocks);
if (ret < 0) {
mlog_errno(ret);
- goto out;
+ goto bail;
}
}
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
extra_blocks);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
mutex_unlock(&tl_inode->i_mutex);
-
+bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
* from ext3. PageChecked() bits have been removed as OCFS2 does not
* do journalled data.
*/
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- jbd2_journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset, length);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5b62c95a6f5d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -35,6 +35,7 @@
#include <linux/time.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
+#include <linux/bitmap.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -176,7 +177,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
-static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
{
int ret = -1;
@@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
-static int o2hb_pop_count(void *map, int count)
-{
- int i = -1, pop = 0;
-
- while ((i = find_next_bit(map, count, i + 1)) < count)
- pop++;
- return pop;
-}
-
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
@@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work)
spin_lock_irqsave(&o2hb_live_lock, flags);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
- failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
- quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
spin_unlock_irqrestore(&o2hb_live_lock, flags);
@@ -500,7 +492,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE, bio);
+ submit_bio(WRITE_SYNC, bio);
status = 0;
bail:
@@ -771,7 +763,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* If global heartbeat active, unpin all regions if the
* region count > CUT_OFF
*/
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
unlock:
@@ -956,23 +948,9 @@ out:
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
-
- return highest;
+ return find_last_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
@@ -1831,7 +1809,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold = O2HB_LIVE_THRESHOLD;
if (o2hb_global_heartbeat_active()) {
spin_lock(&o2hb_live_lock);
- if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
@@ -2182,7 +2160,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (!o2hb_dependent_users)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
o2hb_region_pin(NULL);
@@ -2271,7 +2249,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
continue;
- ret = o2hb_global_hearbeat_mode_set(i);
+ ret = o2hb_global_heartbeat_mode_set(i);
if (!ret)
printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2282,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
NULL,
};
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
.show_attribute = o2hb_heartbeat_group_show,
.store_attribute = o2hb_heartbeat_group_store,
};
@@ -2316,7 +2294,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_hearbeat_group_item_ops,
+ .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -2389,6 +2367,9 @@ static int o2hb_region_pin(const char *region_uuid)
assert_spin_locked(&o2hb_live_lock);
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
uuid = config_item_name(&reg->hr_item);
/* local heartbeat */
@@ -2439,6 +2420,9 @@ static void o2hb_region_unpin(const char *region_uuid)
assert_spin_locked(&o2hb_live_lock);
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
uuid = config_item_name(&reg->hr_item);
if (region_uuid) {
if (strcmp(region_uuid, uuid))
@@ -2476,7 +2460,7 @@ static int o2hb_region_inc_user(const char *region_uuid)
if (o2hb_dependent_users > 1)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
ret = o2hb_region_pin(NULL);
@@ -2654,6 +2638,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
p = region_uuids;
list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
if (numregs < max_regions) {
memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index baa2b9ef7eef..2260fb9e6508 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be droped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
sc->sc_node = NULL;
o2net_debug_del_sc(sc);
+
+ if (sc->sc_page)
+ __free_page(sc->sc_page);
kfree(sc);
}
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
state_change = sc->sc_state_change;
switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- o2net_sc_queue_work(sc, &sc->sc_connect_work);
- break;
- default:
- printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
- " shutdown, state %d\n",
- SC_NODEF_ARGS(sc), sk->sk_state);
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
- break;
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
}
out:
read_unlock(&sk->sk_callback_lock);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
lock->ml.node);
}
} else {
+ status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 33ecbe0e6734..3d09a940c015 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1888,8 +1888,10 @@ ok:
* up nodes that this node contacted */
while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -2357,6 +2359,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
+
if (res->owner != dlm->node_num)
return 0;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..317c0d4024d8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
u8 dead_node)
{
struct dlm_lock_request lr;
- enum dlm_status ret;
+ int ret;
mlog(0, "\n");
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
lr.dead_node = dead_node;
// send message
- ret = DLM_NOLOCKMGR;
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
&lr, sizeof(lr), request_from, NULL);
@@ -1886,6 +1882,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
@@ -2696,6 +2699,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
spin_unlock(&dlm->spinlock);
+ dlm_put(dlm);
return -EAGAIN;
}
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..96f9ac237e86 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
- atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
@@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
- ocfs2_quota_trans_credits(sb);
+ ocfs2_quota_trans_credits(sb) + bits_wanted;
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
- le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
return ret;
}
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
{
int status;
int child_locked = 0;
+ bool is_unlinkable = false;
struct inode *inode = dentry->d_inode;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- if (inode_is_unlinkable(inode)) {
+ if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
goto leave;
}
+ is_unlinkable = true;
}
handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (inode_is_unlinkable(inode)) {
- status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- }
-
/* delete the name from the parent dir */
status = ocfs2_delete_entry(handle, dir, &lookup);
if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
if (S_ISDIR(inode->i_mode))
inc_nlink(dir);
+ goto leave;
+ }
+
+ if (is_unlinkable) {
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+ orphan_name, &orphan_insert, orphan_dir);
+ if (status < 0)
+ mlog_errno(status);
}
leave:
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
+ /*
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just make the logic look more
+ * straightforward.
+ */
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* we're a cluster, and nlink can change on disk from
* underneath us... */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_dir_bh, lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
- }
-
- /*
- * We're going to journal the change of i_flags and i_orphaned_slot.
- * It's safe anyway, though some callers may duplicate the journaling.
- * Journaling within the func just make the logic look more
- * straightforward.
- */
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(inode),
- fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ goto rollback;
}
- le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
/* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
osb->slot_num);
+rollback:
+ if (status < 0) {
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ }
+
leave:
brelse(orphan_dir_bh);
- if (status)
- mlog_errno(status);
return status;
}
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+ di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
di->i_orphaned_slot = 0;
set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
- atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..9f6b96a09615 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2965,6 +2965,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
to = map_end & (PAGE_CACHE_SIZE - 1);
page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
/*
* In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
int status;
/* there is a really tiny chance the journal calls could fail,
* but we wouldn't want inconsistent blocks in *any* case. */
- u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ u64 bg_ptr, prev_bg_ptr;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
(unsigned long long)le64_to_cpu(bg->bg_blkno),
(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
- fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
prev_bg_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out;
prev_bg->bg_next_group = bg->bg_next_group;
ocfs2_journal_dirty(handle, prev_bg_bh);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_prev_bg;
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
ocfs2_journal_dirty(handle, bg_bh);
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_bg;
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
ocfs2_journal_dirty(handle, fe_bh);
-out_rollback:
- if (status < 0) {
- fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
- bg->bg_next_group = cpu_to_le64(bg_ptr);
- prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
- }
-
- if (status)
+out:
+ if (status < 0)
mlog_errno(status);
return status;
+
+out_rollback_bg:
+ bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+ prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+ goto out;
}
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_unlock(&osb->osb_lock);
out += snprintf(buf + out, len - out,
- "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
+ "%10s => Pid: %d Interval: %lu\n", "Commit",
(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
- osb->osb_commit_interval,
- atomic_read(&osb->needs_checkpoint));
+ osb->osb_commit_interval);
out += snprintf(buf + out, len - out,
"%10s => State: %d TxnId: %lu NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
init_waitqueue_head(&osb->checkpoint_event);
- atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
{
int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
struct ocfs2_xa_loc loc;
if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
down_write(&oi->ip_alloc_sem);
if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
- if (!ocfs2_xattr_has_space_inline(inode, di)) {
- ret = -ENOSPC;
- goto out;
- }
- }
-
- if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
if (ret) {
if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
}
new_oi = OCFS2_I(args->new_inode);
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
spin_lock(&new_oi->ip_lock);
new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
diff --git a/fs/open.c b/fs/open.c
index fca72c4d3f17..7931f76acc2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,7 +823,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
int lookup_flags = 0;
int acc_mode;
- if (flags & O_CREAT)
+ if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
op->mode = 0;
@@ -840,10 +840,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_SYNC)
flags |= O_DSYNC;
- if (flags & O_TMPFILE) {
- if (!(flags & O_CREAT))
+ if (flags & __O_TMPFILE) {
+ if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
acc_mode = MAY_OPEN | ACC_MODE(flags);
+ if (!(acc_mode & MAY_WRITE))
+ return -EINVAL;
} else if (flags & O_PATH) {
/*
* If we have O_PATH in the open flag. Then we
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
prpsinfo.pr_zomb = 0;
strcpy(prpsinfo.pr_fname, "vmlinux");
- strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+ strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
nhdr->p_filesz += notesize(&notes[1]);
bufp = storenote(&notes[1], bufp);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
.release = seq_release_private,
};
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ * but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ * these flag is set to denote, that user is aware of the
+ * new API and those page-shift bits change their meaning.
+ * The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ * of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See the
+ * Documentation/vm/soft-dirty.txt for full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(ptent))
continue;
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
- int type;
+ enum clear_refs_types type;
+ int itype;
int rv;
memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- rv = kstrtoint(strstrip(buffer), 10, &type);
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
if (rv < 0)
return rv;
- if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
+
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ soft_dirty_cleared = true;
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
+ "See the linux/Documentation/vm/pagemap.txt for details.\n");
+ }
+
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct clear_refs_private cp = {
+ .type = type,
+ };
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
+ .private = &cp,
};
down_read(&mm->mmap_sem);
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_start(mm, 0, -1);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- clear_refs_walk.private = vma;
+ cp.vma = vma;
if (is_vm_hugetlb_page(vma))
continue;
/*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
struct pagemapread {
int pos, len;
pagemap_entry_t *buffer;
+ bool v2;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
u64 frame, flags;
struct page *page = NULL;
+ int flags2 = 0;
if (pte_present(pte)) {
frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+ *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
/*
* Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
*/
if (pmd_present(pmd))
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
- pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ int pmd_flags2;
+
+ pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* and need a new, higher one */
if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
- pme = make_pme(PM_NOT_PRESENT);
+ pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, vma, addr, *pte);
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pte_t pte, int offset)
{
if (pte_present(pte))
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT);
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!count)
goto out_task;
+ pm.v2 = soft_dirty_cleared;
pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
return ret;
}
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+ pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+ "to stop being page-shift some time soon. See the "
+ "linux/Documentation/vm/pagemap.txt for details.\n");
+ return 0;
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
+ .open = pagemap_open,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
for_each_possible_cpu(i)
idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
- do_posix_clock_monotonic_gettime(&uptime);
- monotonic_to_bootbased(&uptime);
+ get_monotonic_boottime(&uptime);
nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
/* Stores the pointer to the buffer containing kernel elf core headers. */
static char *elfcorebuf;
static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
return read;
}
-/* Maps vmcore file offset to respective physical address in memroy. */
-static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
- struct vmcore **m_ptr)
-{
- struct vmcore *m;
- u64 paddr;
-
- list_for_each_entry(m, vc_list, list) {
- u64 start, end;
- start = m->offset;
- end = m->offset + m->size - 1;
- if (offset >= start && offset <= end) {
- paddr = m->paddr + offset - start;
- *m_ptr = m;
- return paddr;
- }
- }
- *m_ptr = NULL;
- return 0;
-}
-
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
{
ssize_t acc = 0, tmp;
size_t tsz;
- u64 start, nr_bytes;
- struct vmcore *curr_m = NULL;
+ u64 start;
+ struct vmcore *m = NULL;
if (buflen == 0 || *fpos >= vmcore_size)
return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = elfcorebuf_sz - *fpos;
- if (buflen < tsz)
- tsz = buflen;
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
return -EFAULT;
buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
return acc;
}
- start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
- if (!curr_m)
- return -EINVAL;
-
- while (buflen) {
- tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
+ /* Read Elf note segment */
+ if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
-
- tmp = read_from_oldmem(buffer, tsz, &start, 1);
- if (tmp < 0)
- return tmp;
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+ if (copy_to_user(buffer, kaddr, tsz))
+ return -EFAULT;
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
acc += tsz;
- if (start >= (curr_m->paddr + curr_m->size)) {
- if (curr_m->list.next == &vmcore_list)
- return acc; /*EOF*/
- curr_m = list_entry(curr_m->list.next,
- struct vmcore, list);
- start = curr_m->paddr;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (*fpos < m->offset + m->size) {
+ tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ start = m->paddr + *fpos - m->offset;
+ tmp = read_from_oldmem(buffer, tsz, &start, 1);
+ if (tmp < 0)
+ return tmp;
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
}
}
+
return acc;
}
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ * vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
+{
+#ifdef CONFIG_MMU
+ return vmalloc_user(notes_sz);
+#else
+ return vzalloc(notes_sz);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+ u64 start, end, len, tsz;
+ struct vmcore *m;
+
+ start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ end = start + size;
+
+ if (size > vmcore_size || end > vmcore_size)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ len = 0;
+
+ if (start < elfcorebuf_sz) {
+ u64 pfn;
+
+ tsz = min(elfcorebuf_sz - (size_t)start, size);
+ pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+ if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+ vma->vm_page_prot))
+ return -EAGAIN;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ if (start < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+ kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+ kaddr, tsz))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (start < m->offset + m->size) {
+ u64 paddr = 0;
+
+ tsz = min_t(size_t, m->offset + m->size - start, size);
+ paddr = m->paddr + start - m->offset;
+ if (remap_pfn_range(vma, vma->vm_start + len,
+ paddr >> PAGE_SHIFT, tsz,
+ vma->vm_page_prot))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+ }
+
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, vma->vm_start, len);
+ return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ return -ENOSYS;
+}
+#endif
+
static const struct file_operations proc_vmcore_operations = {
.read = read_vmcore,
.llseek = default_llseek,
+ .mmap = mmap_vmcore,
};
static struct vmcore* __init get_new_element(void)
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
}
-static u64 __init get_vmcore_size_elf64(char *elfptr)
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
- int i;
- u64 size;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr *phdr_ptr;
-
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
- size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
- }
- return size;
-}
-
-static u64 __init get_vmcore_size_elf32(char *elfptr)
-{
- int i;
u64 size;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr *phdr_ptr;
+ struct vmcore *m;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
- size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
+ size = elfsz + elfnotesegsz;
+ list_for_each_entry(m, vc_list, list) {
+ size += m->size;
}
return size;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
Elf64_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
+ while (real_sz < max_sz) {
if (nhdr_ptr->n_namesz == 0)
break;
sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
real_sz += sz;
nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf64_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf64(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf64_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return 0;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
Elf32_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
+ while (real_sz < max_sz) {
if (nhdr_ptr->n_namesz == 0)
break;
sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
real_sz += sz;
nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf32_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf32(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf32_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
* the new offset fields of exported program headers. */
static int __init process_ptload_program_headers_elf64(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
static int __init process_ptload_program_headers_elf32(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf64(char *elfptr,
- struct list_head *vc_list)
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
loff_t vmcore_off;
- Elf64_Ehdr *ehdr_ptr;
struct vmcore *m;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
}
}
-/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf32(char *elfptr,
- struct list_head *vc_list)
+static void free_elfcorebuf(void)
{
- loff_t vmcore_off;
- Elf32_Ehdr *ehdr_ptr;
- struct vmcore *m;
-
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
-
- list_for_each_entry(m, vc_list, list) {
- m->offset = vmcore_off;
- vmcore_off += m->size;
- }
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = NULL;
+ vfree(elfnotes_buf);
+ elfnotes_buf = NULL;
}
static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+ ehdr.e_phnum * sizeof(Elf64_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
rc = parse_crash_elf64_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf64(elfcorebuf);
} else if (e_ident[EI_CLASS] == ELFCLASS32) {
rc = parse_crash_elf32_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf32(elfcorebuf);
} else {
pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
+
+ /* Determine vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+
return 0;
}
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
list_del(&m->list);
kfree(m);
}
- kfree(elfcorebuf);
- elfcorebuf = NULL;
+ free_elfcorebuf();
}
EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.parent_ip = parent_ip;
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
- sizeof(rec), psinfo);
+ 0, sizeof(rec), psinfo);
local_irq_restore(flags);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
if (p->psi->erase)
p->psi->erase(p->type, p->id, p->count,
dentry->d_inode->i_ctime, p->psi);
+ else
+ return -EPERM;
return simple_unlink(dir, dentry);
}
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
case PSTORE_TYPE_MCE:
sprintf(name, "mce-%s-%lld", psname, id);
break;
+ case PSTORE_TYPE_PPC_RTAS:
+ sprintf(name, "rtas-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_PPC_OF:
+ sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_PPC_COMMON:
+ sprintf(name, "powerpc-common-%s-%lld", psname, id);
+ break;
case PSTORE_TYPE_UNKNOWN:
sprintf(name, "unknown-%s-%lld", psname, id);
break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
break;
ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- oopscount, hsize + len, psinfo);
+ oopscount, hsize, hsize + len, psinfo);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
memcpy(psinfo->buf, s, c);
- psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
+ psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
static int pstore_write_compat(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part, int count,
- size_t size, struct pstore_info *psi)
+ size_t hsize, size_t size,
+ struct pstore_info *psi)
{
- return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
+ return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+ size, psi);
}
/*
@@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
{
struct module *owner = psi->owner;
+ if (backend && strcmp(backend, psi->name))
+ return -EPERM;
+
spin_lock(&pstore_lock);
if (psinfo) {
spin_unlock(&pstore_lock);
return -EBUSY;
}
- if (backend && strcmp(backend, psi->name)) {
- spin_unlock(&pstore_lock);
- return -EINVAL;
- }
-
if (!psi->write)
psi->write = pstore_write_compat;
psinfo = psi;
@@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
add_timer(&pstore_timer);
}
+ pr_info("pstore: Registered %s as persistent store backend\n",
+ psi->name);
+
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a8f0d6..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part,
- const char *buf, size_t size,
+ const char *buf,
+ size_t hsize, size_t size,
struct pstore_info *psi)
{
struct ramoops_context *cxt = psi->data;
@@ -399,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
- if (!is_power_of_2(pdata->mem_size))
- pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
if (!is_power_of_2(pdata->record_size))
pdata->record_size = rounddown_pow_of_two(pdata->record_size);
if (!is_power_of_2(pdata->console_size))
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 59337326e288..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
}
/* increase and wrap the start pointer, returning the old value */
-static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
{
int old;
int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
}
/* increase the size counter until it hits the max size */
-static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
{
size_t old;
size_t new;
@@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
} while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
}
+static DEFINE_RAW_SPINLOCK(buffer_lock);
+
+/* increase and wrap the start pointer, returning the old value */
+static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+ int old;
+ int new;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&buffer_lock, flags);
+
+ old = atomic_read(&prz->buffer->start);
+ new = old + a;
+ while (unlikely(new > prz->buffer_size))
+ new -= prz->buffer_size;
+ atomic_set(&prz->buffer->start, new);
+
+ raw_spin_unlock_irqrestore(&buffer_lock, flags);
+
+ return old;
+}
+
+/* increase the size counter until it hits the max size */
+static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+ size_t old;
+ size_t new;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&buffer_lock, flags);
+
+ old = atomic_read(&prz->buffer->size);
+ if (old == prz->buffer_size)
+ goto exit;
+
+ new = old + a;
+ if (new > prz->buffer_size)
+ new = prz->buffer_size;
+ atomic_set(&prz->buffer->size, new);
+
+exit:
+ raw_spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
+static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
+
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
uint8_t *data, size_t len, uint8_t *ecc)
{
@@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
return NULL;
}
+ buffer_start_add = buffer_start_add_locked;
+ buffer_size_add = buffer_size_add_locked;
+
return ioremap(start, size);
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
return proc_dointvec(table, write, buffer, lenp, ppos);
}
-static ctl_table fs_dqstats_table[] = {
+static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
{ },
};
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
{
.procname = "quota",
.mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
{ },
};
-static ctl_table sys_table[] = {
+static struct ctl_table sys_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533792ee..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
}
/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ goto out;
+
/*
* is this block fully invalidated?
*/
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (!offset && ret) {
+ if (!partial_page && ret) {
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 33532f79b4f7..a958444a75fc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -19,12 +19,13 @@
/*
* LOCKING:
*
- * We rely on new Alexander Viro's super-block locking.
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
*
*/
-static int show_version(struct seq_file *m, struct super_block *sb)
+static int show_version(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
char *format;
if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
@@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb)
#define DJP( x ) le32_to_cpu( jp -> x )
#define JF( x ) ( r -> s_journal -> x )
-static int show_super(struct seq_file *m, struct super_block *sb)
+static int show_super(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
seq_printf(m, "state: \t%s\n"
@@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_per_level(struct seq_file *m, struct super_block *sb)
+static int show_per_level(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
int level;
@@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_bitmap(struct seq_file *m, struct super_block *sb)
+static int show_bitmap(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
seq_printf(m, "free_block: %lu\n"
@@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
+static int show_on_disk_super(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
struct reiserfs_super_block *rs = sb_info->s_rs;
int hash_code = DFL(s_hash_function_code);
@@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_oidmap(struct seq_file *m, struct super_block *sb)
+static int show_oidmap(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
struct reiserfs_super_block *rs = sb_info->s_rs;
unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
@@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_journal(struct seq_file *m, struct super_block *sb)
+static int show_journal(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
struct reiserfs_super_block *rs = r->s_rs;
struct journal_params *jp = &rs->s_v1.s_journal;
@@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
return 0;
}
-/* iterator */
-static int test_sb(struct super_block *sb, void *data)
-{
- return data == sb;
-}
-
-static int set_sb(struct super_block *sb, void *data)
-{
- return -ENOENT;
-}
-
-struct reiserfs_seq_private {
- struct super_block *sb;
- int (*show) (struct seq_file *, struct super_block *);
-};
-
-static void *r_start(struct seq_file *m, loff_t * pos)
-{
- struct reiserfs_seq_private *priv = m->private;
- loff_t l = *pos;
-
- if (l)
- return NULL;
-
- if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
- return NULL;
-
- up_write(&priv->sb->s_umount);
- return priv->sb;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t * pos)
-{
- ++*pos;
- if (v)
- deactivate_super(v);
- return NULL;
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
- if (v)
- deactivate_super(v);
-}
-
-static int r_show(struct seq_file *m, void *v)
-{
- struct reiserfs_seq_private *priv = m->private;
- return priv->show(m, v);
-}
-
-static const struct seq_operations r_ops = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = r_show,
-};
-
static int r_open(struct inode *inode, struct file *file)
{
- struct reiserfs_seq_private *priv;
- int ret = seq_open_private(file, &r_ops,
- sizeof(struct reiserfs_seq_private));
-
- if (!ret) {
- struct seq_file *m = file->private_data;
- priv = m->private;
- priv->sb = proc_get_parent_data(inode);
- priv->show = PDE_DATA(inode);
- }
- return ret;
+ return single_open(file, PDE_DATA(inode),
+ proc_get_parent_data(inode));
}
static const struct file_operations r_file_operations = {
.open = r_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
- .owner = THIS_MODULE,
+ .release = single_release,
};
static struct proc_dir_entry *proc_info_root = NULL;
static const char proc_info_root_name[] = "fs/reiserfs";
static void add_file(struct super_block *sb, char *name,
- int (*func) (struct seq_file *, struct super_block *))
+ int (*func) (struct seq_file *, void *))
{
proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
&r_file_operations, func);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f8a23c3078f8..e2e202a07b31 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -499,6 +499,7 @@ int remove_save_link(struct inode *inode, int truncate)
static void reiserfs_kill_sb(struct super_block *s)
{
if (REISERFS_SB(s)) {
+ reiserfs_proc_info_done(s);
/*
* Force any pending inode evictions to occur now. Any
* inodes to be removed that have extended attributes
@@ -554,8 +555,6 @@ static void reiserfs_put_super(struct super_block *s)
REISERFS_SB(s)->reserved_blocks);
}
- reiserfs_proc_info_done(s);
-
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
kfree(s->s_fs_info);
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c27062..35d4adc749d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,8 @@
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/busy_poll.h>
#include <asm/uaccess.h>
@@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
set_current_state(state);
if (!pwq->triggered)
- rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ rc = freezable_schedule_hrtimeout_range(expires, slack,
+ HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
@@ -384,9 +387,10 @@ get_max:
#define POLLEX_SET (POLLPRI)
static inline void wait_key_set(poll_table *wait, unsigned long in,
- unsigned long out, unsigned long bit)
+ unsigned long out, unsigned long bit,
+ unsigned int ll_flag)
{
- wait->_key = POLLEX_SET;
+ wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
@@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
@@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+ bool can_busy_loop = false;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) {
- wait_key_set(wait, in, out, bit);
+ wait_key_set(wait, in, out,
+ bit, busy_flag);
mask = (*f_op->poll)(f.file, wait);
}
fdput(f);
@@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval++;
wait->_qproc = NULL;
}
+ /* got something, stop busy polling */
+ if (retval) {
+ can_busy_loop = false;
+ busy_flag = 0;
+
+ /*
+ * only remember a returned
+ * POLL_BUSY_LOOP if we asked for it
+ */
+ } else if (busy_flag & mask)
+ can_busy_loop = true;
+
}
}
if (res_in)
@@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
break;
}
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
@@ -717,7 +748,9 @@ struct poll_list {
* pwait poll_table will be used by the fd-provided poll handler for waiting,
* if pwait->_qproc is non-NULL.
*/
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+ bool *can_busy_poll,
+ unsigned int busy_flag)
{
unsigned int mask;
int fd;
@@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
mask = DEFAULT_POLLMASK;
if (f.file->f_op && f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP;
+ pwait->_key |= busy_flag;
mask = f.file->f_op->poll(f.file, pwait);
+ if (mask & busy_flag)
+ *can_busy_poll = true;
}
/* Mask out unneeded events. */
mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
+ unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+ unsigned long busy_end = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (;;) {
struct poll_list *walk;
+ bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt)) {
+ if (do_pollfd(pfd, pt, &can_busy_loop,
+ busy_flag)) {
count++;
pt->_qproc = NULL;
+ /* found something, stop busy polling */
+ busy_flag = 0;
+ can_busy_loop = false;
}
}
}
@@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
if (count || timed_out)
break;
+ /* only if found POLL_BUSY_LOOP sockets && not out of time */
+ if (can_busy_loop && !need_resched()) {
+ if (!busy_end) {
+ busy_end = busy_loop_end_time();
+ continue;
+ }
+ if (!busy_loop_timeout(busy_end))
+ continue;
+ }
+ busy_flag = 0;
+
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
return rcu_dereference(node->next);
}
EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_precpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+ struct hlist_node *node;
+
+ for_each_possible_cpu(*cpu) {
+ hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+ if (pos-- == 0)
+ return node;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v: pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu: pointer to cpu "cursor"
+ * @pos: start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+ int *cpu, loff_t *pos)
+{
+ struct hlist_node *node = v;
+
+ ++*pos;
+
+ if (node->next)
+ return node->next;
+
+ for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+ *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+ struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+ if (!hlist_empty(bucket))
+ return bucket->first;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/splice.c b/fs/splice.c
index cc53bd04be8f..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1269,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
* @in: file to splice from
* @ppos: input file offset
* @out: file to splice to
+ * @opos: output file offset
* @len: number of bytes to splice
* @flags: splice modifier flags
*
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..68307c029228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super);
* and want to turn it into a full-blown active reference. grab_super()
* is called with sb_lock held and drops it. Returns 1 in case of
* success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called).
+ * dying when grab_super() had been called). Note that this is only
+ * called for superblocks not in rundown mode (== ones still on ->fs_supers
+ * of their type), so increment of ->s_count is OK here.
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
- if (atomic_inc_not_zero(&s->s_active)) {
- spin_unlock(&sb_lock);
- return 1;
- }
- /* it's going away */
s->s_count++;
spin_unlock(&sb_lock);
- /* wait for it to die */
down_write(&s->s_umount);
+ if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+ put_super(s);
+ return 1;
+ }
up_write(&s->s_umount);
put_super(s);
return 0;
@@ -463,11 +463,6 @@ retry:
destroy_super(s);
s = NULL;
}
- down_write(&old->s_umount);
- if (unlikely(!(old->s_flags & MS_BORN))) {
- deactivate_locked_super(old);
- goto retry;
- }
return old;
}
}
@@ -660,10 +655,10 @@ restart:
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
- if (grab_super(sb)) /* drops sb_lock */
- return sb;
- else
+ if (!grab_super(sb))
goto restart;
+ up_write(&sb->s_umount);
+ return sb;
}
}
spin_unlock(&sb_lock);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4cfd742d260d..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
}
/**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
* @sd: sysfs_dirent of interest
*
* Link @sd into its sibling rbtree which starts from
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
+ if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
}
spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..09a1a25cd145 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp)
{
struct attribute *const* attr;
- int i;
+ struct bin_attribute *const* bin_attr;
- for (i = 0, attr = grp->attrs; *attr; i++, attr++)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->attrs)
+ for (attr = grp->attrs; *attr; attr++)
+ sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->bin_attrs)
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+ sysfs_remove_bin_file(kobj, *bin_attr);
}
static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
struct attribute *const* attr;
+ struct bin_attribute *const* bin_attr;
int error = 0, i;
- for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- umode_t mode = 0;
+ if (grp->attrs) {
+ for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+ umode_t mode = 0;
+
+ /*
+ * In update mode, we're changing the permissions or
+ * visibility. Do this by first removing then
+ * re-adding (if required) the file.
+ */
+ if (update)
+ sysfs_hash_and_remove(dir_sd, NULL,
+ (*attr)->name);
+ if (grp->is_visible) {
+ mode = grp->is_visible(kobj, *attr, i);
+ if (!mode)
+ continue;
+ }
+ error = sysfs_add_file_mode(dir_sd, *attr,
+ SYSFS_KOBJ_ATTR,
+ (*attr)->mode | mode);
+ if (unlikely(error))
+ break;
+ }
+ if (error) {
+ remove_files(dir_sd, kobj, grp);
+ goto exit;
+ }
+ }
- /* in update mode, we're changing the permissions or
- * visibility. Do this by first removing then
- * re-adding (if required) the file */
- if (update)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
- if (grp->is_visible) {
- mode = grp->is_visible(kobj, *attr, i);
- if (!mode)
- continue;
+ if (grp->bin_attrs) {
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ if (update)
+ sysfs_remove_bin_file(kobj, *bin_attr);
+ error = sysfs_create_bin_file(kobj, *bin_attr);
+ if (error)
+ break;
}
- error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
- (*attr)->mode | mode);
- if (unlikely(error))
- break;
+ if (error)
+ remove_files(dir_sd, kobj, grp);
}
- if (error)
- remove_files(dir_sd, kobj, grp);
+exit:
return error;
}
@@ -67,8 +93,8 @@ static int internal_create_group(struct kobject *kobj, int update,
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
return -EINVAL;
- if (!grp->attrs) {
- WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+ if (!grp->attrs && !grp->bin_attrs) {
+ WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
kobj->name, grp->name ? "" : grp->name);
return -EINVAL;
}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
#include <linux/security.h>
#include "sysfs.h"
-extern struct super_block * sysfs_sb;
-
static const struct address_space_operations sysfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/alarmtimer.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
@@ -26,7 +27,10 @@
#include <linux/rcupdate.h>
struct timerfd_ctx {
- struct hrtimer tmr;
+ union {
+ struct hrtimer tmr;
+ struct alarm alarm;
+ } t;
ktime_t tintv;
ktime_t moffs;
wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
static LIST_HEAD(cancel_list);
static DEFINE_SPINLOCK(cancel_lock);
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+ return ctx->clockid == CLOCK_REALTIME_ALARM ||
+ ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
/*
* This gets called when the timer event triggers. We set the "expired"
* flag, but we do not re-arm the timer (in case it's necessary,
* tintv.tv64 != 0) until the timer is accessed.
*/
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
{
- struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
unsigned long flags;
spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
ctx->ticks++;
wake_up_locked(&ctx->wqh);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+ struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+ t.tmr);
+ timerfd_triggered(ctx);
return HRTIMER_NORESTART;
}
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+ ktime_t now)
+{
+ struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+ t.alarm);
+ timerfd_triggered(ctx);
+ return ALARMTIMER_NORESTART;
+}
+
/*
* Called when the clock was set to cancel the timers in the cancel
* list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
{
- if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
- (flags & TFD_TIMER_CANCEL_ON_SET)) {
+ if ((ctx->clockid == CLOCK_REALTIME ||
+ ctx->clockid == CLOCK_REALTIME_ALARM) &&
+ (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
if (!ctx->might_cancel) {
ctx->might_cancel = true;
spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
{
ktime_t remaining;
- remaining = hrtimer_expires_remaining(&ctx->tmr);
+ if (isalarm(ctx))
+ remaining = alarm_expires_remaining(&ctx->t.alarm);
+ else
+ remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->expired = 0;
ctx->ticks = 0;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
- hrtimer_init(&ctx->tmr, clockid, htmode);
- hrtimer_set_expires(&ctx->tmr, texp);
- ctx->tmr.function = timerfd_tmrproc;
+
+ if (isalarm(ctx)) {
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ } else {
+ hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_set_expires(&ctx->t.tmr, texp);
+ ctx->t.tmr.function = timerfd_tmrproc;
+ }
+
if (texp.tv64 != 0) {
- hrtimer_start(&ctx->tmr, texp, htmode);
+ if (isalarm(ctx)) {
+ if (flags & TFD_TIMER_ABSTIME)
+ alarm_start(&ctx->t.alarm, texp);
+ else
+ alarm_start_relative(&ctx->t.alarm, texp);
+ } else {
+ hrtimer_start(&ctx->t.tmr, texp, htmode);
+ }
+
if (timerfd_canceled(ctx))
return -ECANCELED;
}
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
struct timerfd_ctx *ctx = file->private_data;
timerfd_remove_cancel(ctx);
- hrtimer_cancel(&ctx->tmr);
+
+ if (isalarm(ctx))
+ alarm_cancel(&ctx->t.alarm);
+ else
+ hrtimer_cancel(&ctx->t.tmr);
kfree_rcu(ctx, rcu);
return 0;
}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
* callback to avoid DoS attacks specifying a very
* short timer period.
*/
- ticks += hrtimer_forward_now(&ctx->tmr,
- ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+ if (isalarm(ctx)) {
+ ticks += alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ticks += hrtimer_forward_now(&ctx->t.tmr,
+ ctx->tintv) - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
ctx->expired = 0;
ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
if ((flags & ~TFD_CREATE_FLAGS) ||
(clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME))
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_REALTIME_ALARM &&
+ clockid != CLOCK_BOOTTIME_ALARM))
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
init_waitqueue_head(&ctx->wqh);
ctx->clockid = clockid;
- hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+
+ if (isalarm(ctx))
+ alarm_init(&ctx->t.alarm,
+ ctx->clockid == CLOCK_REALTIME_ALARM ?
+ ALARM_REALTIME : ALARM_BOOTTIME,
+ timerfd_alarmproc);
+ else
+ hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+
ctx->moffs = ktime_get_monotonic_offset();
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
*/
for (;;) {
spin_lock_irq(&ctx->wqh.lock);
- if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
- break;
+
+ if (isalarm(ctx)) {
+ if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
+ break;
+ } else {
+ if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
+ break;
+ }
spin_unlock_irq(&ctx->wqh.lock);
cpu_relax();
}
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
* We do not update "ticks" and "expired" since the timer will be
* re-programmed again in the following timerfd_setup() call.
*/
- if (ctx->expired && ctx->tintv.tv64)
- hrtimer_forward_now(&ctx->tmr, ctx->tintv);
+ if (ctx->expired && ctx->tintv.tv64) {
+ if (isalarm(ctx))
+ alarm_forward_now(&ctx->t.alarm, ctx->tintv);
+ else
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
+ }
old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
spin_lock_irq(&ctx->wqh.lock);
if (ctx->expired && ctx->tintv.tv64) {
ctx->expired = 0;
- ctx->ticks +=
- hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
- hrtimer_restart(&ctx->tmr);
+
+ if (isalarm(ctx)) {
+ ctx->ticks +=
+ alarm_forward_now(
+ &ctx->t.alarm, ctx->tintv) - 1;
+ alarm_restart(&ctx->t.alarm);
+ } else {
+ ctx->ticks +=
+ hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
+ - 1;
+ hrtimer_restart(&ctx->t.tmr);
+ }
}
t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec(ctx->tintv);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset)
+ if (offset || length < PAGE_CACHE_SIZE)
/* Partial page remains dirty */
return;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
c->vi.ubi_num, c->vi.vol_id, c->vi.name,
- c->ro_mount ? ", R/O mode" : NULL);
+ c->ro_mount ? ", R/O mode" : "");
x = (long long)c->main_lebs * c->leb_size;
y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9ac4057a86c9..839a2bad7f45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -630,6 +630,12 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
struct udf_sb_info *sbi = UDF_SB(sb);
int error = 0;
+ if (sbi->s_lvid_bh) {
+ int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
+ if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+ return -EACCES;
+ }
+
uopt.flags = sbi->s_flags;
uopt.uid = sbi->s_uid;
uopt.gid = sbi->s_gid;
@@ -649,12 +655,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
sbi->s_dmode = uopt.dmode;
write_unlock(&sbi->s_cred_lock);
- if (sbi->s_lvid_bh) {
- int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
- if (write_rev > UDF_MAX_WRITE_VERSION)
- *flags |= MS_RDONLY;
- }
-
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
goto out_unlock;
@@ -843,27 +843,38 @@ static int udf_find_fileset(struct super_block *sb,
return 1;
}
+/*
+ * Load primary Volume Descriptor Sequence
+ *
+ * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence
+ * should be tried.
+ */
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
struct ustr *instr, *outstr;
struct buffer_head *bh;
uint16_t ident;
- int ret = 1;
+ int ret = -ENOMEM;
instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
if (!instr)
- return 1;
+ return -ENOMEM;
outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
if (!outstr)
goto out1;
bh = udf_read_tagged(sb, block, block, &ident);
- if (!bh)
+ if (!bh) {
+ ret = -EAGAIN;
goto out2;
+ }
- BUG_ON(ident != TAG_IDENT_PVD);
+ if (ident != TAG_IDENT_PVD) {
+ ret = -EIO;
+ goto out_bh;
+ }
pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -889,8 +900,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
if (udf_CS0toUTF8(outstr, instr))
udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
- brelse(bh);
ret = 0;
+out_bh:
+ brelse(bh);
out2:
kfree(outstr);
out1:
@@ -947,7 +959,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
if (mdata->s_mirror_fe == NULL) {
udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
- goto error_exit;
+ return -EIO;
}
}
@@ -964,23 +976,18 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
addr.logicalBlockNum, addr.partitionReferenceNum);
mdata->s_bitmap_fe = udf_iget(sb, &addr);
-
if (mdata->s_bitmap_fe == NULL) {
if (sb->s_flags & MS_RDONLY)
udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
- goto error_exit;
+ return -EIO;
}
}
}
udf_debug("udf_load_metadata_files Ok\n");
-
return 0;
-
-error_exit:
- return 1;
}
static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1069,7 +1076,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!map->s_uspace.s_table) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
p_index);
- return 1;
+ return -EIO;
}
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
@@ -1079,7 +1086,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (phd->unallocSpaceBitmap.extLength) {
struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
if (!bitmap)
- return 1;
+ return -ENOMEM;
map->s_uspace.s_bitmap = bitmap;
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
@@ -1102,7 +1109,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!map->s_fspace.s_table) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
p_index);
- return 1;
+ return -EIO;
}
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
@@ -1113,7 +1120,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (phd->freedSpaceBitmap.extLength) {
struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
if (!bitmap)
- return 1;
+ return -ENOMEM;
map->s_fspace.s_bitmap = bitmap;
bitmap->s_extPosition = le32_to_cpu(
phd->freedSpaceBitmap.extPosition);
@@ -1165,7 +1172,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
}
if (!sbi->s_vat_inode)
- return 1;
+ return -EIO;
if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
map->s_type_specific.s_virtual.s_start_offset = 0;
@@ -1177,7 +1184,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
pos = udf_block_map(sbi->s_vat_inode, 0);
bh = sb_bread(sb, pos);
if (!bh)
- return 1;
+ return -EIO;
vat20 = (struct virtualAllocationTable20 *)bh->b_data;
} else {
vat20 = (struct virtualAllocationTable20 *)
@@ -1195,6 +1202,12 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
return 0;
}
+/*
+ * Load partition descriptor block
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor
+ * sequence.
+ */
static int udf_load_partdesc(struct super_block *sb, sector_t block)
{
struct buffer_head *bh;
@@ -1204,13 +1217,15 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
int i, type1_idx;
uint16_t partitionNumber;
uint16_t ident;
- int ret = 0;
+ int ret;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 1;
- if (ident != TAG_IDENT_PD)
+ return -EAGAIN;
+ if (ident != TAG_IDENT_PD) {
+ ret = 0;
goto out_bh;
+ }
p = (struct partitionDesc *)bh->b_data;
partitionNumber = le16_to_cpu(p->partitionNumber);
@@ -1229,10 +1244,13 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
if (i >= sbi->s_partitions) {
udf_debug("Partition (%d) not found in partition map\n",
partitionNumber);
+ ret = 0;
goto out_bh;
}
ret = udf_fill_partdesc_info(sb, p, i);
+ if (ret < 0)
+ goto out_bh;
/*
* Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
@@ -1249,32 +1267,37 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
break;
}
- if (i >= sbi->s_partitions)
+ if (i >= sbi->s_partitions) {
+ ret = 0;
goto out_bh;
+ }
ret = udf_fill_partdesc_info(sb, p, i);
- if (ret)
+ if (ret < 0)
goto out_bh;
if (map->s_partition_type == UDF_METADATA_MAP25) {
ret = udf_load_metadata_files(sb, i);
- if (ret) {
+ if (ret < 0) {
udf_err(sb, "error loading MetaData partition map %d\n",
i);
goto out_bh;
}
} else {
- ret = udf_load_vat(sb, i, type1_idx);
- if (ret)
- goto out_bh;
/*
- * Mark filesystem read-only if we have a partition with
- * virtual map since we don't handle writing to it (we
- * overwrite blocks instead of relocating them).
+ * If we have a partition with virtual map, we don't handle
+ * writing to it (we overwrite blocks instead of relocating
+ * them).
*/
- sb->s_flags |= MS_RDONLY;
- pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
+ goto out_bh;
+ }
+ ret = udf_load_vat(sb, i, type1_idx);
+ if (ret < 0)
+ goto out_bh;
}
+ ret = 0;
out_bh:
/* In case loading failed, we handle cleanup in udf_fill_super */
brelse(bh);
@@ -1340,11 +1363,11 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
uint16_t ident;
struct buffer_head *bh;
unsigned int table_len;
- int ret = 0;
+ int ret;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 1;
+ return -EAGAIN;
BUG_ON(ident != TAG_IDENT_LVD);
lvd = (struct logicalVolDesc *)bh->b_data;
table_len = le32_to_cpu(lvd->mapTableLength);
@@ -1352,7 +1375,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
udf_err(sb, "error loading logical volume descriptor: "
"Partition table too long (%u > %lu)\n", table_len,
sb->s_blocksize - sizeof(*lvd));
- ret = 1;
+ ret = -EIO;
goto out_bh;
}
@@ -1396,11 +1419,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
} else if (!strncmp(upm2->partIdent.ident,
UDF_ID_SPARABLE,
strlen(UDF_ID_SPARABLE))) {
- if (udf_load_sparable_map(sb, map,
- (struct sparablePartitionMap *)gpm) < 0) {
- ret = 1;
+ ret = udf_load_sparable_map(sb, map,
+ (struct sparablePartitionMap *)gpm);
+ if (ret < 0)
goto out_bh;
- }
} else if (!strncmp(upm2->partIdent.ident,
UDF_ID_METADATA,
strlen(UDF_ID_METADATA))) {
@@ -1465,7 +1487,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
}
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
-
+ ret = 0;
out_bh:
brelse(bh);
return ret;
@@ -1503,22 +1525,18 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
- * udf_process_sequence
- *
- * PURPOSE
- * Process a main/reserve volume descriptor sequence.
- *
- * PRE-CONDITIONS
- * sb Pointer to _locked_ superblock.
- * block First block of first extent of the sequence.
- * lastblock Lastblock of first extent of the sequence.
+ * Process a main/reserve volume descriptor sequence.
+ * @block First block of first extent of the sequence.
+ * @lastblock Lastblock of first extent of the sequence.
+ * @fileset There we store extent containing root fileset
*
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
+ * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor
+ * sequence
*/
-static noinline int udf_process_sequence(struct super_block *sb, long block,
- long lastblock, struct kernel_lb_addr *fileset)
+static noinline int udf_process_sequence(
+ struct super_block *sb,
+ sector_t block, sector_t lastblock,
+ struct kernel_lb_addr *fileset)
{
struct buffer_head *bh = NULL;
struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1529,6 +1547,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
uint32_t vdsn;
uint16_t ident;
long next_s = 0, next_e = 0;
+ int ret;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1543,7 +1562,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
udf_err(sb,
"Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
(unsigned long long)block);
- return 1;
+ return -EAGAIN;
}
/* Process each descriptor (ISO 13346 3/8.3-8.4) */
@@ -1616,14 +1635,19 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
*/
if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
udf_err(sb, "Primary Volume Descriptor not found!\n");
- return 1;
+ return -EAGAIN;
+ }
+ ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block);
+ if (ret < 0)
+ return ret;
+
+ if (vds[VDS_POS_LOGICAL_VOL_DESC].block) {
+ ret = udf_load_logicalvol(sb,
+ vds[VDS_POS_LOGICAL_VOL_DESC].block,
+ fileset);
+ if (ret < 0)
+ return ret;
}
- if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
- return 1;
-
- if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
- vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
- return 1;
if (vds[VDS_POS_PARTITION_DESC].block) {
/*
@@ -1632,19 +1656,27 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
*/
for (block = vds[VDS_POS_PARTITION_DESC].block;
block < vds[VDS_POS_TERMINATING_DESC].block;
- block++)
- if (udf_load_partdesc(sb, block))
- return 1;
+ block++) {
+ ret = udf_load_partdesc(sb, block);
+ if (ret < 0)
+ return ret;
+ }
}
return 0;
}
+/*
+ * Load Volume Descriptor Sequence described by anchor in bh
+ *
+ * Returns <0 on error, 0 on success
+ */
static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
struct kernel_lb_addr *fileset)
{
struct anchorVolDescPtr *anchor;
- long main_s, main_e, reserve_s, reserve_e;
+ sector_t main_s, main_e, reserve_s, reserve_e;
+ int ret;
anchor = (struct anchorVolDescPtr *)bh->b_data;
@@ -1662,18 +1694,26 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
/* Process the main & reserve sequences */
/* responsible for finding the PartitionDesc(s) */
- if (!udf_process_sequence(sb, main_s, main_e, fileset))
- return 1;
- udf_sb_free_partitions(sb);
- if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset))
- return 1;
+ ret = udf_process_sequence(sb, main_s, main_e, fileset);
+ if (ret != -EAGAIN)
+ return ret;
udf_sb_free_partitions(sb);
- return 0;
+ ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+ if (ret < 0) {
+ udf_sb_free_partitions(sb);
+ /* No sequence was OK, return -EIO */
+ if (ret == -EAGAIN)
+ ret = -EIO;
+ }
+ return ret;
}
/*
* Check whether there is an anchor block in the given block and
* load Volume Descriptor Sequence if so.
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor
+ * block
*/
static int udf_check_anchor_block(struct super_block *sb, sector_t block,
struct kernel_lb_addr *fileset)
@@ -1685,33 +1725,40 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
udf_fixed_to_variable(block) >=
sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
+ return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 0;
+ return -EAGAIN;
if (ident != TAG_IDENT_AVDP) {
brelse(bh);
- return 0;
+ return -EAGAIN;
}
ret = udf_load_sequence(sb, bh, fileset);
brelse(bh);
return ret;
}
-/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
- struct kernel_lb_addr *fileset)
+/*
+ * Search for an anchor volume descriptor pointer.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set
+ * of anchors.
+ */
+static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
+ struct kernel_lb_addr *fileset)
{
sector_t last[6];
int i;
struct udf_sb_info *sbi = UDF_SB(sb);
int last_count = 0;
+ int ret;
/* First try user provided anchor */
if (sbi->s_anchor) {
- if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
- return lastblock;
+ ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset);
+ if (ret != -EAGAIN)
+ return ret;
}
/*
* according to spec, anchor is in either:
@@ -1720,39 +1767,46 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
* lastblock
* however, if the disc isn't closed, it could be 512.
*/
- if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
- return lastblock;
+ ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset);
+ if (ret != -EAGAIN)
+ return ret;
/*
* The trouble is which block is the last one. Drives often misreport
* this so we try various possibilities.
*/
- last[last_count++] = lastblock;
- if (lastblock >= 1)
- last[last_count++] = lastblock - 1;
- last[last_count++] = lastblock + 1;
- if (lastblock >= 2)
- last[last_count++] = lastblock - 2;
- if (lastblock >= 150)
- last[last_count++] = lastblock - 150;
- if (lastblock >= 152)
- last[last_count++] = lastblock - 152;
+ last[last_count++] = *lastblock;
+ if (*lastblock >= 1)
+ last[last_count++] = *lastblock - 1;
+ last[last_count++] = *lastblock + 1;
+ if (*lastblock >= 2)
+ last[last_count++] = *lastblock - 2;
+ if (*lastblock >= 150)
+ last[last_count++] = *lastblock - 150;
+ if (*lastblock >= 152)
+ last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
if (last[i] >= sb->s_bdev->bd_inode->i_size >>
sb->s_blocksize_bits)
continue;
- if (udf_check_anchor_block(sb, last[i], fileset))
- return last[i];
+ ret = udf_check_anchor_block(sb, last[i], fileset);
+ if (ret != -EAGAIN) {
+ if (!ret)
+ *lastblock = last[i];
+ return ret;
+ }
if (last[i] < 256)
continue;
- if (udf_check_anchor_block(sb, last[i] - 256, fileset))
- return last[i];
+ ret = udf_check_anchor_block(sb, last[i] - 256, fileset);
+ if (ret != -EAGAIN) {
+ if (!ret)
+ *lastblock = last[i];
+ return ret;
+ }
}
/* Finally try block 512 in case media is open */
- if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
- return last[0];
- return 0;
+ return udf_check_anchor_block(sb, sbi->s_session + 512, fileset);
}
/*
@@ -1760,54 +1814,59 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
* area specified by it. The function expects sbi->s_lastblock to be the last
* block on the media.
*
- * Return 1 if ok, 0 if not found.
- *
+ * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor
+ * was not found.
*/
static int udf_find_anchor(struct super_block *sb,
struct kernel_lb_addr *fileset)
{
- sector_t lastblock;
struct udf_sb_info *sbi = UDF_SB(sb);
+ sector_t lastblock = sbi->s_last_block;
+ int ret;
- lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
- if (lastblock)
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret != -EAGAIN)
goto out;
/* No anchor found? Try VARCONV conversion of block numbers */
UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ lastblock = udf_variable_to_fixed(sbi->s_last_block);
/* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb,
- udf_variable_to_fixed(sbi->s_last_block),
- fileset);
- if (lastblock)
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret != -EAGAIN)
goto out;
+ lastblock = sbi->s_last_block;
/* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
- if (!lastblock) {
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret < 0) {
/* VARCONV didn't help. Clear it. */
UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
- return 0;
}
out:
- sbi->s_last_block = lastblock;
- return 1;
+ if (ret == 0)
+ sbi->s_last_block = lastblock;
+ return ret;
}
/*
* Check Volume Structure Descriptor, find Anchor block and load Volume
- * Descriptor Sequence
+ * Descriptor Sequence.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor
+ * block was not found.
*/
static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
int silent, struct kernel_lb_addr *fileset)
{
struct udf_sb_info *sbi = UDF_SB(sb);
loff_t nsr_off;
+ int ret;
if (!sb_set_blocksize(sb, uopt->blocksize)) {
if (!silent)
udf_warn(sb, "Bad block size\n");
- return 0;
+ return -EINVAL;
}
sbi->s_last_block = uopt->lastblock;
if (!uopt->novrs) {
@@ -1828,12 +1887,13 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
/* Look for anchor block and load Volume Descriptor Sequence */
sbi->s_anchor = uopt->anchor;
- if (!udf_find_anchor(sb, fileset)) {
- if (!silent)
+ ret = udf_find_anchor(sb, fileset);
+ if (ret < 0) {
+ if (!silent && ret == -EAGAIN)
udf_warn(sb, "No anchor found\n");
- return 0;
+ return ret;
}
- return 1;
+ return 0;
}
static void udf_open_lvid(struct super_block *sb)
@@ -1939,7 +1999,7 @@ u64 lvid_get_unique_id(struct super_block *sb)
static int udf_fill_super(struct super_block *sb, void *options, int silent)
{
- int ret;
+ int ret = -EINVAL;
struct inode *inode = NULL;
struct udf_options uopt;
struct kernel_lb_addr rootdir, fileset;
@@ -2011,7 +2071,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
} else {
uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
- if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+ if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
if (!silent)
pr_notice("Rescanning with blocksize %d\n",
UDF_DEFAULT_BLOCKSIZE);
@@ -2021,8 +2081,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
}
}
- if (!ret) {
- udf_warn(sb, "No partition found (1)\n");
+ if (ret < 0) {
+ if (ret == -EAGAIN) {
+ udf_warn(sb, "No partition found (1)\n");
+ ret = -EINVAL;
+ }
goto error_out;
}
@@ -2040,9 +2103,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
le16_to_cpu(lvidiu->minUDFReadRev),
UDF_MAX_READ_VERSION);
+ ret = -EINVAL;
+ goto error_out;
+ } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
+ !(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
goto error_out;
- } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
- sb->s_flags |= MS_RDONLY;
+ }
sbi->s_udfrev = minUDFWriteRev;
@@ -2054,17 +2121,20 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!sbi->s_partitions) {
udf_warn(sb, "No partition found (2)\n");
+ ret = -EINVAL;
goto error_out;
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
- UDF_PART_FLAG_READ_ONLY) {
- pr_notice("Partition marked readonly; forcing readonly mount\n");
- sb->s_flags |= MS_RDONLY;
+ UDF_PART_FLAG_READ_ONLY &&
+ !(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
+ goto error_out;
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
udf_warn(sb, "No fileset found\n");
+ ret = -EINVAL;
goto error_out;
}
@@ -2086,6 +2156,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!inode) {
udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
+ ret = -EIO;
goto error_out;
}
@@ -2093,6 +2164,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
udf_err(sb, "Couldn't allocate root dentry\n");
+ ret = -ENOMEM;
goto error_out;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -2113,7 +2185,7 @@ error_out:
kfree(sbi);
sb->s_fs_info = NULL;
- return -EINVAL;
+ return ret;
}
void _udf_err(struct super_block *sb, const char *function,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
xfs_dir2_sf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_icreate_item.o \
xfs_inode.o \
xfs_log_recover.o \
xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbnoa,
+ args->alignment,
+ args->userdata, *sbnoa,
*slena, &new);
/*
@@ -976,7 +985,8 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbnoa, gtlena, &gtnew);
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- ltbnoa, ltlena, &ltnew);
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a695048be7..79670869d436 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -843,10 +843,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -910,7 +912,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -940,7 +942,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
@@ -1514,13 +1516,26 @@ xfs_vm_write_failed(
loff_t pos,
unsigned len)
{
- loff_t block_offset = pos & PAGE_MASK;
+ loff_t block_offset;
loff_t block_start;
loff_t block_end;
loff_t from = pos & (PAGE_CACHE_SIZE - 1);
loff_t to = from + len;
struct buffer_head *bh, *head;
+ /*
+ * The request pos offset might be 32 or 64 bit, this is all fine
+ * on 64-bit platform. However, for 64-bit pos request on 32-bit
+ * platform, the high 32-bit will be masked off if we evaluate the
+ * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+ * 0xfffff000 as an unsigned long, hence the result is incorrect
+ * which could cause the following ASSERT failed in most cases.
+ * In order to avoid this, we can evaluate the block_offset of the
+ * start of the page by using shifts rather than masks the mismatch
+ * problem.
+ */
+ block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
ASSERT(block_offset + from == pos);
head = page_buffers(page);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 31d3cd129269..b800fbcafc7f 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -690,6 +690,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 89042848f9ec..05c698ccb238 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1161,6 +1161,24 @@ xfs_bmap_extents_to_btree(
* since the file data needs to get logged so things will stay consistent.
* (The bmap-level manipulations are ok, though).
*/
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+ xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
STATIC int /* error */
xfs_bmap_local_to_extents(
xfs_trans_t *tp, /* transaction pointer */
@@ -1174,9 +1192,12 @@ xfs_bmap_local_to_extents(
struct xfs_inode *ip,
struct xfs_ifork *ifp))
{
- int error; /* error return value */
+ int error = 0;
int flags; /* logging flags returned */
xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
/*
* We don't want to deal with the case of keeping inode data inline yet.
@@ -1185,68 +1206,65 @@ xfs_bmap_local_to_extents(
ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
+
flags = 0;
error = 0;
- if (ifp->if_bytes) {
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent block */
- xfs_bmbt_rec_host_t *ep;/* extent record pointer */
-
- ASSERT((ifp->if_flags &
- (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = ip->i_mount;
- args.firstblock = *firstblock;
- /*
- * Allocate a block. We know we need only one, since the
- * file currently fits in an inode.
- */
- if (*firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = *firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- args.total = total;
- args.minlen = args.maxlen = args.prod = 1;
- error = xfs_alloc_vextent(&args);
- if (error)
- goto done;
-
- /* Can't fail, the space was reserved. */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(args.len == 1);
- *firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-
- /* initialise the block and copy the data */
- init_fn(tp, bp, ip, ifp);
-
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
- xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_iext_add(ifp, 0, 1);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
- xfs_trans_mod_dquot_byino(tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- flags |= xfs_ilog_fext(whichfork);
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+ XFS_IFINLINE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
} else {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
}
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+ /* initialise the block and copy the data */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size and log everything */
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
+
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+
done:
*logflagsp = flags;
return error;
@@ -1323,25 +1341,6 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Block initialisation function for local to extent format conversion.
- *
- * This shouldn't actually be called by anyone, so make sure debug kernels cause
- * a noticable failure.
- */
-STATIC void
-xfs_bmap_local_to_extents_init_fn(
- struct xfs_trans *tp,
- struct xfs_buf *bp,
- struct xfs_inode *ip,
- struct xfs_ifork *ifp)
-{
- ASSERT(0);
- bp->b_ops = &xfs_bmbt_buf_ops;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
-}
-
-/*
* Called from xfs_bmap_add_attrfork to handle local format files. Each
* different data fork content type needs a different callout to do the
* conversion. Some are basic and only require special block initialisation
@@ -1381,9 +1380,9 @@ xfs_bmap_add_attrfork_local(
flags, XFS_DATA_FORK,
xfs_symlink_local_to_remote);
- return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
- XFS_DATA_FORK,
- xfs_bmap_local_to_extents_init_fn);
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return EFSCORRUPTED;
}
/*
@@ -4907,20 +4906,19 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & XFS_BMAPI_IGSTATE));
ASSERT(tp != NULL);
ASSERT(len > 0);
-
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
@@ -4933,37 +4931,6 @@ xfs_bmapi_write(
XFS_STATS_INC(xs_blk_mapw);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- /*
- * XXX (dgc): This assumes we are only called for inodes that
- * contain content neutral data in local format. Anything that
- * contains caller-specific data in local format that needs
- * transformation to move to a block format needs to do the
- * conversion to extent format itself.
- *
- * Directory data forks and attribute forks handle this
- * themselves, but with the addition of metadata verifiers every
- * data fork in local format now contains caller specific data
- * and as such conversion through this function is likely to be
- * broken.
- *
- * The only likely user of this branch is for remote symlinks,
- * but we cannot overwrite the data fork contents of the symlink
- * (EEXIST occurs higher up the stack) and so it will never go
- * from local format to extent format here. Hence I don't think
- * this branch is ever executed intentionally and we should
- * consider removing it and asserting that xfs_bmapi_write()
- * cannot be called directly on local format forks. i.e. callers
- * are completely responsible for local to extent format
- * conversion, not xfs_bmapi_write().
- */
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
- &bma.logflags, whichfork,
- xfs_bmap_local_to_extents_init_fn);
- if (error)
- goto error0;
- }
-
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516eb..1cf1292d29b7 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -172,6 +172,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
#endif
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
#define XFS_BMDR_SPACE_CALC(nrecs) \
(int)(sizeof(xfs_bmdr_block_t) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
/*
* Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it.
+ * It is not being included in the transaction
+ * commit, so no vectors are used at all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ return XFS_LOG_VEC_ORDERED;
+ }
+
/*
* the vector count is based on the number of buffer vectors we have
* dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
goto out;
}
+
/*
* Fill in an iovec for each set of contiguous chunks.
*/
@@ -299,18 +310,36 @@ xfs_buf_item_format(
/*
* If it is an inode buffer, transfer the in-memory state to the
- * format flags and clear the in-memory state. We do not transfer
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
* this state if the inode buffer allocation has not yet been committed
* to the log as setting the XFS_BLI_INODE_BUF flag will prevent
* correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
+ if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+ XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so don't format it.
+ */
+ trace_xfs_buf_item_format_ordered(bip);
+ return;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
vecp = xfs_buf_item_format_segment(bip, vecp, offset,
&bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted, clean, i;
- uint hold;
+ bool clean;
+ bool aborted;
+ int flags;
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
*/
- aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
+ * Before possibly freeing the buf item, copy the per-transaction state
+ * so we can reference it safely later after clearing it from the
+ * buffer log item.
*/
- hold = bip->bli_flags & XFS_BLI_HOLD;
-
- /* Clear the per transaction state. */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+ flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
* If the buf item is marked stale, then don't do anything. We'll
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
+ if (flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
* be the only reference to the buf item, so we free it anyway
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
+ *
+ * Ordered buffers are dirty but may have no recorded changes, so ensure
+ * we only release clean items here.
*/
- clean = 1;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = 0;
- break;
+ clean = (flags & XFS_BLI_DIRTY) ? false : true;
+ if (clean) {
+ int i;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = false;
+ break;
+ }
}
}
if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
} else
atomic_dec(&bip->bli_refcount);
- if (!hold)
+ if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
}
@@ -842,12 +877,6 @@ xfs_buf_item_log(
struct xfs_buf *bp = bip->bli_buf;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* walk each buffer segment and mark them dirty appropriately.
*/
start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
*/
uint
@@ -907,11 +936,11 @@ void
xfs_buf_item_relse(
xfs_buf_t *bp)
{
- xfs_buf_log_item_t *bip;
+ xfs_buf_log_item_t *bip = bp->b_fspriv;
trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
- bip = bp->b_fspriv;
bp->b_fspriv = bip->bli_item.li_bio_list;
if (bp->b_fspriv == NULL)
bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
#define XFS_BLI_INODE_BUF 0x40
+#define XFS_BLI_ORDERED 0x80
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
{ XFS_BLI_LOGGED, "LOGGED" }, \
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
{ XFS_BLI_STALE_INODE, "STALE_INODE" }, \
- { XFS_BLI_INODE_BUF, "INODE_BUF" }
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
*/
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
return EINVAL;
if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
return EINVAL;
-
if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index f7a0e95d197a..e5869b50dc41 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
* There is a very similar struct icdinode in xfs_inode which matches the
* layout of the first 96 bytes of this structure, but is kept in native
* format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
*/
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
@@ -132,9 +135,6 @@ typedef enum xfs_dinode_fmt {
#define XFS_LITINO(mp, version) \
((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
-#define XFS_BROOT_SIZE_ADJ(ip) \
- (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
-
/*
* Inode data & attribute fork sizes, per inode.
*/
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 09aea0247d96..5e7fbd72cf52 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_dir2_format.h"
@@ -1164,13 +1165,15 @@ xfs_dir2_sf_to_block(
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_name name;
+ struct xfs_ifork *ifp;
trace_xfs_dir2_sf_to_block(args);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
/*
* Bomb out if the shortform directory is way too short.
*/
@@ -1179,22 +1182,23 @@ xfs_dir2_sf_to_block(
return XFS_ERROR(EIO);
}
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes == dp->i_d.di_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_d.di_nextents == 0);
/*
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
- memcpy(sfp, oldsfp, dp->i_df.if_bytes);
+ sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
- xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
* Add block 0 to the inode.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e0cc1243a8aa..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp = *bpp;
struct xfs_bmbt_irec *map = mip->map;
+ struct blk_plug plug;
int error = 0;
int length;
int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
*/
+ blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
}
}
}
+ blk_finish_plug(&plug);
out:
*bpp = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..0adf27ecf3f1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
xfs_buf_t **O_bpp,
uint flags)
{
- xfs_bmbt_irec_t map;
- int nmaps = 1, error;
- xfs_buf_t *bp;
- xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
- xfs_mount_t *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- xfs_trans_t *tp = (tpp ? *tpp : NULL);
+ struct xfs_bmbt_irec map;
+ int nmaps = 1, error;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ struct xfs_trans *tp = (tpp ? *tpp : NULL);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
@@ -804,7 +804,7 @@ xfs_qm_dqget(
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
@@ -936,6 +936,7 @@ xfs_qm_dqput_final(
{
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
trace_xfs_dqput_free(dqp);
@@ -949,21 +950,29 @@ xfs_qm_dqput_final(
/*
* If we just added a udquot to the freelist, then we want to release
- * the gdquot reference that it (probably) has. Otherwise it'll keep
- * the gdquot from getting reclaimed.
+ * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
+ * keep the gdquot/pdquot from getting reclaimed.
*/
gdqp = dqp->q_gdquot;
if (gdqp) {
xfs_dqlock(gdqp);
dqp->q_gdquot = NULL;
}
+
+ pdqp = dqp->q_pdquot;
+ if (pdqp) {
+ xfs_dqlock(pdqp);
+ dqp->q_pdquot = NULL;
+ }
xfs_dqunlock(dqp);
/*
- * If we had a group quota hint, release it now.
+ * If we had a group/project quota hint, release it now.
*/
if (gdqp)
xfs_qm_dqput(gdqp);
+ if (pdqp)
+ xfs_qm_dqput(pdqp);
}
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..55abbca2883d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -53,6 +53,7 @@ typedef struct xfs_dquot {
xfs_fileoff_t q_fileoffset; /* offset in quotas file */
struct xfs_dquot*q_gdquot; /* group dquot, hint only */
+ struct xfs_dquot*q_pdquot; /* project dquot, hint only */
xfs_disk_dquot_t q_core; /* actual usage & quotas */
xfs_dq_logitem_t q_logitem; /* dquot log item */
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -118,8 +119,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
case XFS_DQ_USER:
return XFS_IS_UQUOTA_ON(mp);
case XFS_DQ_GROUP:
+ return XFS_IS_GQUOTA_ON(mp);
case XFS_DQ_PROJ:
- return XFS_IS_OQUOTA_ON(mp);
+ return XFS_IS_PQUOTA_ON(mp);
default:
return 0;
}
@@ -131,8 +133,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
case XFS_DQ_USER:
return ip->i_udquot;
case XFS_DQ_GROUP:
- case XFS_DQ_PROJ:
return ip->i_gdquot;
+ case XFS_DQ_PROJ:
+ return ip->i_pdquot;
default:
return NULL;
}
@@ -143,10 +146,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
- XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
- XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
if (!bp)
return EIO;
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_bmap.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
#endif
/*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
*/
-STATIC int
+int
xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct list_head *buffer_list,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno,
XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+ /*
+ * log the initialisation that is about to take place as an
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+ mp->m_sb.sb_inodesize, length, gen);
} else if (xfs_sb_version_hasnlink(&mp->m_sb))
version = 2;
else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction
- * to log a whole cluster of inodes instead of all the
- * individual transactions causing a lot of log traffic.
- */
+
+ /* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
ino++;
uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
xfs_dinode_calc_crc(mp, free);
- } else {
+ } else if (tp) {
/* just log the inode core */
xfs_trans_log_buf(tp, fbuf, ioffset,
ioffset + isize - 1);
}
}
- if (version == 3) {
- /* need to log the entire buffer */
- xfs_trans_log_buf(tp, fbuf, 0,
- BBTOB(fbuf->b_length) - 1);
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures the they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
}
- xfs_trans_inode_alloc_buf(tp, fbuf);
}
return 0;
}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
- */
+ */
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
args.len, prandom_u32());
if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
struct xfs_btree_cur *cur,
xfs_agino_t agino,
xfs_inobt_rec_incore_t *rec,
- int *done,
- int left)
+ int *done)
{
int error;
int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
pag->pagl_leftrec != NULLAGINO &&
pag->pagl_rightrec != NULLAGINO) {
error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
- &trec, &doneleft, 1);
+ &trec, &doneleft);
if (error)
goto error1;
error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
- &rec, &doneright, 0);
+ &rec, &doneright);
if (error)
goto error1;
} else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
+
extern const struct xfs_buf_ops xfs_agi_buf_ops;
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..3f90e1ceb8d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,9 @@ xfs_iget_cache_miss(
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
+ ip->i_pdquot = NULL;
xfs_iflags_set(ip, iflags);
/* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
-int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+
+kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC uint
+xfs_icreate_item_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
+ log_vector->i_len = sizeof(struct xfs_icreate_log);
+ log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ ASSERT(0);
+ return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_pin = xfs_icreate_item_pin,
+ .iop_unpin = xfs_icreate_item_unpin,
+ .iop_push = xfs_icreate_item_push,
+ .iop_unlock = xfs_icreate_item_unlock,
+ .iop_committed = xfs_icreate_item_committed,
+ .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
+
+/*
+ * on disk log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+ __uint16_t icl_type; /* type of log format structure */
+ __uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..bb262c25c8de 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
to->di_lsn = cpu_to_be64(from->di_lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
}
}
@@ -1028,6 +1030,15 @@ xfs_dinode_calc_crc(
/*
* Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
*/
int
xfs_iread(
@@ -1047,6 +1058,23 @@ xfs_iread(
if (error)
return error;
+ /* shortcut IO on inode allocation if possible */
+ if ((iget_flags & XFS_IGET_CREATE) &&
+ xfs_sb_version_hascrc(&mp->m_sb) &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ /* initialise the on-disk inode core */
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_d.di_magic = XFS_DINODE_MAGIC;
+ ip->i_d.di_gen = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ip->i_d.di_version = 3;
+ ip->i_d.di_ino = ip->i_ino;
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ } else
+ ip->i_d.di_version = 2;
+ return 0;
+ }
+
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
@@ -1133,17 +1161,16 @@ xfs_iread(
xfs_buf_set_ref(bp, XFS_INO_REF);
/*
- * Use xfs_trans_brelse() to release the buffer containing the
- * on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal
+ * Use xfs_trans_brelse() to release the buffer containing the on-disk
+ * inode, because it was acquired with xfs_trans_read_buf() in
+ * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
* brelse(). If we're within a transaction, then xfs_trans_brelse()
* will only release the buffer if it is not dirty within the
* transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be
- * locking the new in-core inode before putting it in the hash
- * table where other processes can find it. Thus we don't have
- * to worry about the inode being changed just because we released
- * the buffer.
+ * because inodes on disk are never destroyed and we will be locking the
+ * new in-core inode before putting it in the cache where other
+ * processes can find it. Thus we don't have to worry about the inode
+ * being changed just because we released the buffer.
*/
out_brelse:
xfs_trans_brelse(tp, bp);
@@ -2028,8 +2055,6 @@ xfs_ifree(
int error;
int delete;
xfs_ino_t first_ino;
- xfs_dinode_t *dip;
- xfs_buf_t *ibp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2067,13 @@ xfs_ifree(
* Pull the on-disk inode from the AGI unlinked list.
*/
error = xfs_iunlink_remove(tp, ip);
- if (error != 0) {
+ if (error)
return error;
- }
error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
- if (error != 0) {
+ if (error)
return error;
- }
+
ip->i_d.di_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2085,10 @@ xfs_ifree(
* by reincarnations of this inode.
*/
ip->i_d.di_gen++;
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error)
- return error;
-
- /*
- * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
- * from picking up this inode when it is reclaimed (its incore state
- * initialzed but not flushed to disk yet). The in-core di_mode is
- * already cleared and a corresponding transaction logged.
- * The hack here just synchronizes the in-core to on-disk
- * di_mode value in advance before the actual inode sync to disk.
- * This is OK because the inode is already unlinked and would never
- * change its di_mode again for this inode generation.
- * This is a temporary hack that would require a proper fix
- * in the future.
- */
- dip->di_mode = 0;
-
- if (delete) {
+ if (delete)
error = xfs_ifree_cluster(ip, tp, first_ino);
- }
return error;
}
@@ -2160,8 +2163,8 @@ xfs_iroot_realloc(
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
return;
}
@@ -2214,8 +2217,9 @@ xfs_iroot_realloc(
kmem_free(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
+ if (ifp->if_broot)
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
return;
}
@@ -2526,9 +2530,8 @@ xfs_iflush_fork(
if ((iip->ili_fields & brootflag[whichfork]) &&
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
- ASSERT(ifp->if_broot_bytes <=
- (XFS_IFORK_SIZE(ip, whichfork) +
- XFS_BROOT_SIZE_ADJ(ip)));
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2886,12 +2889,18 @@ xfs_iflush_int(
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
goto corrupt_out;
}
+
/*
- * bump the flush iteration count, used to detect flushes which
- * postdate a log record during recovery. This is redundant as we now
- * log every change and hence this can't happen. Still, it doesn't hurt.
+ * Inode item log recovery for v1/v2 inodes are dependent on the
+ * di_flushiter count for correct sequencing. We bump the flush
+ * iteration count so we can detect flushes which postdate a log record
+ * during recovery. This is redundant as we now log every change and
+ * hence this can't happen but we need to still do it to ensure
+ * backwards compatibility with old kernels that predate logging all
+ * inode changes.
*/
- ip->i_d.di_flushiter++;
+ if (ip->i_d.di_version < 3)
+ ip->i_d.di_flushiter++;
/*
* Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91129794aaec..b55fd347ab5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -250,6 +250,7 @@ typedef struct xfs_inode {
struct xfs_mount *i_mount; /* fs mount struct ptr */
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e999680094a..6e2bca5d44d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -248,7 +248,7 @@ xfs_open_by_handle(
goto out_dput;
}
- fd = get_unused_fd();
+ fd = get_unused_fd_flags(0);
if (fd < 0) {
error = fd;
goto out_dput;
@@ -928,7 +928,7 @@ xfs_ioctl_setattr(
struct xfs_trans *tp;
unsigned int lock_flags = 0;
struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
@@ -957,7 +957,7 @@ xfs_ioctl_setattr(
if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+ XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
if (code)
return code;
}
@@ -994,8 +994,8 @@ xfs_ioctl_setattr(
XFS_IS_PQUOTA_ON(mp) &&
xfs_get_projid(ip) != fa->fsx_projid) {
ASSERT(tp);
- code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
+ pdqp, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
goto error_return;
@@ -1113,7 +1113,7 @@ xfs_ioctl_setattr(
if (xfs_get_projid(ip) != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
+ &ip->i_pdquot, pdqp);
}
xfs_set_projid(ip, fa->fsx_projid);
@@ -1160,13 +1160,13 @@ xfs_ioctl_setattr(
*/
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
return code;
error_return:
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
xfs_trans_cancel(tp, 0);
if (lock_flags)
xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
return 0;
/*
+ * If the file is smaller than the minimum prealloc and we are using
+ * dynamic preallocation, don't do any preallocation at all as it is
+ * likely this is the only write to the file that is going to be done.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+ return 0;
+
+ /*
* If there are any real blocks past eof, then don't
* do any speculative allocation.
*/
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
return 0;
+ /* If the file is small, then use the minimum prealloc */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+ return 0;
+
/*
* As we write multiple pages, the offset will always align to the
* start of a page and hence point to a hole at EOF. i.e. if the size is
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..96dda62d497b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -467,9 +467,6 @@ xfs_setattr_mode(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
-
ip->i_d.di_mode &= S_IFMT;
ip->i_d.di_mode |= mode & ~S_IFMT;
@@ -495,15 +492,18 @@ xfs_setattr_nonsize(
trace_xfs_setattr(ip);
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ /* If acls are being inherited, we already have this checked */
+ if (!(flags & XFS_ATTR_NOACL)) {
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- error = -inode_change_ok(inode, iattr);
- if (error)
- return XFS_ERROR(error);
+ error = -inode_change_ok(inode, iattr);
+ if (error)
+ return XFS_ERROR(error);
+ }
ASSERT((mask & ATTR_SIZE) == 0);
@@ -539,7 +539,7 @@ xfs_setattr_nonsize(
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
- qflags, &udqp, &gdqp);
+ qflags, &udqp, &gdqp, NULL);
if (error)
return error;
}
@@ -575,7 +575,7 @@ xfs_setattr_nonsize(
(XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
ASSERT(tp);
error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
goto out_trans_cancel;
@@ -987,7 +987,8 @@ xfs_fiemap_format(
if (bmv->bmv_oflags & BMV_OF_PREALLOC)
fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
physical = 0; /* no block yet */
}
if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..b93e14b86754 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
{
return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
(xfs_sb_version_hasquota(&mp->m_sb) &&
- (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+ xfs_is_quota_inode(&mp->m_sb, ino)));
}
/*
@@ -221,7 +221,6 @@ xfs_bulkstat(
char __user *ubufp; /* pointer into user's buffer */
int ubelem; /* spaces used in user's buffer */
int ubused; /* bytes used by formatter */
- xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
/*
* Get the last inode value, see if there's nothing to do.
@@ -263,7 +262,6 @@ xfs_bulkstat(
rval = 0;
while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
cond_resched();
- bp = NULL;
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error) {
/*
@@ -383,11 +381,13 @@ xfs_bulkstat(
* Also start read-ahead now for this chunk.
*/
if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ struct blk_plug plug;
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
+ blk_start_plug(&plug);
agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +399,7 @@ xfs_bulkstat(
agbno, nbcluster,
&xfs_inode_buf_ops);
}
+ blk_finish_plug(&plug);
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
@@ -433,27 +434,7 @@ xfs_bulkstat(
irbp->ir_freecount < XFS_INODES_PER_CHUNK;
chunkidx++, clustidx++, agino++) {
ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
- /*
- * Recompute agbno if this is the
- * first inode of the cluster.
- *
- * Careful with clustidx. There can be
- * multiple clusters per chunk, a single
- * cluster per chunk or a cluster that has
- * inodes represented from several different
- * chunks (if blocksize is large).
- *
- * Because of this, the starting clustidx is
- * initialized to zero in this loop but must
- * later be reset after reading in the cluster
- * buffer.
- */
- if ((chunkidx & (nicluster - 1)) == 0) {
- agbno = XFS_AGINO_TO_AGBNO(mp,
- irbp->ir_startino) +
- ((chunkidx & nimask) >>
- mp->m_sb.sb_inopblog);
- }
+
ino = XFS_AGINO_TO_INO(mp, agno, agino);
/*
* Skip if this inode is free.
@@ -499,10 +480,6 @@ xfs_bulkstat(
cond_resched();
}
-
- if (bp)
- xfs_buf_relse(bp);
-
/*
* Set up for the next loop iteration.
*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..bf89eb97fefd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1941,7 +1941,7 @@ xlog_print_tic_res(
xfs_alert_tag(mp, XFS_PTAG_LOGRES,
"xlog_write: reservation ran out. Need to up reservation");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
}
/*
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ continue;
+
headers += lv->lv_niovecs;
for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
- while (lv && index < lv->lv_niovecs) {
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2236,13 +2240,22 @@ xlog_write(
* This loop writes out as many regions as can fit in the amount
* of space which was allocated by xlog_state_get_iclog_space().
*/
- while (lv && index < lv->lv_niovecs) {
- struct xfs_log_iovec *reg = &vecp[index];
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+ struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
+ bool ordered = false;
+
+ /* ordered log vectors have no regions to write */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(lv->lv_niovecs == 0);
+ ordered = true;
+ goto next_lv;
+ }
+ reg = &vecp[index];
ASSERT(reg->i_len % sizeof(__int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
@@ -2302,12 +2315,13 @@ xlog_write(
break;
if (++index == lv->lv_niovecs) {
+next_lv:
lv = lv->lv_next;
index = 0;
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0) {
+ if (record_cnt == 0 && ordered == false) {
if (!lv)
return 0;
break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XLOG_REG_TYPE_UNMOUNT 17
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_MAX 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_MAX 20
typedef struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
int lv_buf_len; /* size of formatted buffer */
};
+#define XFS_LOG_VEC_ORDERED (-1)
+
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
int index;
int len = 0;
uint niovecs;
+ bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
if (!niovecs)
continue;
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ }
+
new_lv = kmem_zalloc(sizeof(*new_lv) +
niovecs * sizeof(struct xfs_log_iovec),
KM_SLEEP|KM_NOFS);
+ new_lv->lv_item = lidp->lid_item;
+ new_lv->lv_niovecs = niovecs;
+ if (ordered) {
+ /* track as an ordered logvec */
+ new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ goto next;
+ }
+
/* The allocated iovec region lies beyond the log vector. */
new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
- new_lv->lv_niovecs = niovecs;
- new_lv->lv_item = lidp->lid_item;
/* build the vector array and calculate it's length */
IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
}
ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+next:
if (!ret_lv)
ret_lv = new_lv;
else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
if (old) {
/* existing lv on log item, space used is a delta */
- ASSERT(!list_empty(&lv->lv_item->li_cil));
- ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+ ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
+ old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+
+ /*
+ * If the new item is ordered, keep the old one that is already
+ * tracking dirty or ordered regions
+ */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(!lv->lv_buf);
+ kmem_free(lv);
+ return;
+ }
*len += lv->lv_buf_len - old->lv_buf_len;
*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
} else {
/* new lv, must pin the log item */
ASSERT(!lv->lv_item->li_lv);
- ASSERT(list_empty(&lv->lv_item->li_cil));
- *len += lv->lv_buf_len;
- *diff_iovecs += lv->lv_niovecs;
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ *len += lv->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs;
+ }
IOP_PIN(lv->lv_item);
}
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- for (lv = log_vector; lv; lv = lv->lv_next)
- xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
- /* account for space used by new iovec headers */
- len += diff_iovecs * sizeof(xlog_op_header_t);
-
spin_lock(&cil->xc_cil_lock);
+ for (lv = log_vector; lv; ) {
+ struct xfs_log_vec *next = lv->lv_next;
- /* move the items to the tail of the CIL */
- for (lv = log_vector; lv; lv = lv->lv_next)
+ ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
+ lv->lv_next = NULL;
+
+ /*
+ * xfs_cil_prepare_item() may free the lv, so move the item on
+ * the CIL first.
+ */
list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+ xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+ lv = next;
+ }
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
ctx->nvecs += diff_iovecs;
/*
@@ -381,9 +416,7 @@ xlog_cil_push(
struct xfs_cil_ctx *new_ctx;
struct xlog_in_core *commit_iclog;
struct xlog_ticket *tic;
- int num_lv;
int num_iovecs;
- int len;
int error = 0;
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
* side which is currently locked out by the flush lock.
*/
lv = NULL;
- num_lv = 0;
num_iovecs = 0;
- len = 0;
while (!list_empty(&cil->xc_cil)) {
struct xfs_log_item *item;
- int i;
item = list_first_entry(&cil->xc_cil,
struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
lv->lv_next = item->li_lv;
lv = item->li_lv;
item->li_lv = NULL;
-
- num_lv++;
num_iovecs += lv->lv_niovecs;
- for (i = 0; i < lv->lv_niovecs; i++)
- len += lv->lv_iovecp[i].i_len;
}
/*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = log->l_cilp->xc_ctx->sequence;
+ /* xlog_cil_insert_items() destroys log_vector list */
xlog_cil_insert_items(log, log_vector, tp->t_ticket);
/* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..7681b19aa5dc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_icreate_item.h"
/* Need all the magic numbers and buffer ops structures from these headers */
#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
* form the cancelled buffer table. Hence they have tobe done last.
*
* 3. Inode allocation buffers must be replayed before inode items that
- * read the buffer and replay changes into it.
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
*
* 4. Inode unlink buffers must be replayed after inode items are replayed.
* This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
* from all the other buffers and move them to last.
*
* Hence, 4 lists, in order from head to tail:
- * - buffer_list for all buffers except cancelled/inode unlink buffers
- * - item_list for all non-buffer items
- * - inode_buffer_list for inode unlink buffers
- * - cancel_list for the cancelled buffers
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
@@ -2578,8 +2592,16 @@ xlog_recover_inode_pass2(
goto error;
}
- /* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
@@ -2594,6 +2616,7 @@ xlog_recover_inode_pass2(
goto error;
}
}
+
/* Take the opportunity to reset the flush iteration count */
dicp->di_flushiter = 0;
@@ -2982,6 +3005,93 @@ xlog_recover_efd_pass2(
}
/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
+
+ /* existing allocation is fixed value */
+ ASSERT(count == XFS_IALLOC_INODES(mp));
+ ASSERT(length == XFS_IALLOC_BLOCKS(mp));
+ if (count != XFS_IALLOC_INODES(mp) ||
+ length != XFS_IALLOC_BLOCKS(mp)) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
+ }
+
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
+ return 0;
+}
+
+/*
* Free up any resources allocated by the transaction
*
* Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -3023,6 +3133,7 @@ xlog_recover_commit_pass1(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
/* nothing to do in pass 1 */
return 0;
default:
@@ -3053,6 +3164,8 @@ xlog_recover_commit_pass2(
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
return xlog_recover_dquot_pass2(log, buffer_list, item);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..a0fa8021330d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -379,6 +379,19 @@ xfs_mount_validate_sb(
}
}
+ if (xfs_sb_version_has_pquotino(sbp)) {
+ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+ xfs_notice(mp,
+ "Version 5 of Super block has XFS_OQUOTA bits.\n");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+ xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -561,6 +574,55 @@ out_unwind:
return error;
}
+static void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ /*
+ * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+ * leads to in-core values having two different values for a quota
+ * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+ * NULLFSINO.
+ *
+ * Note that this change affect only the in-core values. These
+ * values are not written back to disk unless any quota information
+ * is written to the disk. Even in that case, sb_pquotino field is
+ * not written to disk unless the superblock supports pquotino.
+ */
+ if (sbp->sb_uquotino == 0)
+ sbp->sb_uquotino = NULLFSINO;
+ if (sbp->sb_gquotino == 0)
+ sbp->sb_gquotino = NULLFSINO;
+ if (sbp->sb_pquotino == 0)
+ sbp->sb_pquotino = NULLFSINO;
+
+ /*
+ * We need to do these manipilations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_version_has_pquotino(sbp))
+ return;
+
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
+ /*
+ * In older version of superblock, on-disk superblock only
+ * has sb_gquotino, and in-core superblock has both sb_gquotino
+ * and sb_pquotino. But, only one of them is supported at any
+ * point of time. So, if PQUOTA is set in disk superblock,
+ * copy over sb_gquotino to sb_pquotino.
+ */
+ sbp->sb_pquotino = sbp->sb_gquotino;
+ sbp->sb_gquotino = NULLFSINO;
+ }
+}
+
void
xfs_sb_from_disk(
struct xfs_sb *to,
@@ -622,6 +684,57 @@ xfs_sb_from_disk(
to->sb_lsn = be64_to_cpu(from->sb_lsn);
}
+static inline void
+xfs_sb_quota_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t *fields)
+{
+ __uint16_t qflags = from->sb_qflags;
+
+ /*
+ * We need to do these manipilations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_version_has_pquotino(from))
+ return;
+
+ if (*fields & XFS_SB_QFLAGS) {
+ /*
+ * The in-core version of sb_qflags do not have
+ * XFS_OQUOTA_* flags, whereas the on-disk version
+ * does. So, convert incore XFS_{PG}QUOTA_* flags
+ * to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+ *fields &= ~XFS_SB_QFLAGS;
+ }
+
+ /*
+ * GQUOTINO and PQUOTINO cannot be used together in versions
+ * of superblock that do not have pquotino. from->sb_flags
+ * tells us which quota is active and should be copied to
+ * disk.
+ */
+ if ((*fields & XFS_SB_GQUOTINO) &&
+ (from->sb_qflags & XFS_GQUOTA_ACCT))
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ else if ((*fields & XFS_SB_PQUOTINO) &&
+ (from->sb_qflags & XFS_PQUOTA_ACCT))
+ to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+
+ *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+
/*
* Copy in core superblock to ondisk one.
*
@@ -643,6 +756,7 @@ xfs_sb_to_disk(
if (!fields)
return;
+ xfs_sb_quota_to_disk(to, from, &fields);
while (fields) {
f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
first = xfs_sb_info[f].offset;
@@ -835,6 +949,7 @@ reread:
*/
xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(&mp->m_sb);
/*
* We must be able to do sector-sized and sector-aligned IO.
*/
@@ -987,42 +1102,27 @@ xfs_update_alignment(xfs_mount_t *mp)
*/
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. blocksize)");
- return XFS_ERROR(EINVAL);
- }
- mp->m_dalign = mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
} else {
/*
* Convert the stripe unit and width to FSBs.
*/
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. ag size)");
- return XFS_ERROR(EINVAL);
- }
xfs_warn(mp,
- "stripe alignment turned off: sunit(%d)/swidth(%d) "
- "incompatible with agsize(%d)",
- mp->m_dalign, mp->m_swidth,
- sbp->sb_agblocks);
-
- mp->m_dalign = 0;
- mp->m_swidth = 0;
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ sbp->sb_agblocks);
+ return XFS_ERROR(EINVAL);
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "sunit(%d) less than bsize(%d)",
- mp->m_dalign,
- mp->m_blockmask +1);
- return XFS_ERROR(EINVAL);
- }
- mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
}
}
@@ -1039,6 +1139,10 @@ xfs_update_alignment(xfs_mount_t *mp)
sbp->sb_width = mp->m_swidth;
mp->m_update_flags |= XFS_SB_WIDTH;
}
+ } else {
+ xfs_warn(mp,
+ "cannot change alignment: superblock does not support data alignment");
+ return XFS_ERROR(EINVAL);
}
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
uint m_chsize; /* size of next field */
- struct xfs_chash *m_chash; /* fs private inode per-cluster
- * hash table */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
operations, typically for
disk errors in metadata */
#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
- user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
allocations */
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..1e2361d0294e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
void *data)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
uint32_t next_index;
int last_error = 0;
int skipped;
@@ -137,6 +137,7 @@ xfs_qm_dqpurge(
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -145,8 +146,7 @@ xfs_qm_dqpurge(
}
/*
- * If this quota has a group hint attached, prepare for releasing it
- * now.
+ * If this quota has a hint attached, prepare for releasing it now.
*/
gdqp = dqp->q_gdquot;
if (gdqp) {
@@ -154,6 +154,12 @@ xfs_qm_dqpurge(
dqp->q_gdquot = NULL;
}
+ pdqp = dqp->q_pdquot;
+ if (pdqp) {
+ xfs_dqlock(pdqp);
+ dqp->q_pdquot = NULL;
+ }
+
dqp->dq_flags |= XFS_DQ_FREEING;
xfs_dqflock(dqp);
@@ -189,7 +195,7 @@ xfs_qm_dqpurge(
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -208,6 +214,8 @@ xfs_qm_dqpurge(
if (gdqp)
xfs_qm_dqput(gdqp);
+ if (pdqp)
+ xfs_qm_dqput(pdqp);
return 0;
}
@@ -299,8 +307,10 @@ xfs_qm_mount_quotas(
*/
if (!XFS_IS_UQUOTA_ON(mp))
mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
- mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
write_changes:
/*
@@ -362,6 +372,10 @@ xfs_qm_unmount_quotas(
IRELE(mp->m_quotainfo->qi_gquotaip);
mp->m_quotainfo->qi_gquotaip = NULL;
}
+ if (mp->m_quotainfo->qi_pquotaip) {
+ IRELE(mp->m_quotainfo->qi_pquotaip);
+ mp->m_quotainfo->qi_pquotaip = NULL;
+ }
}
}
@@ -408,7 +422,10 @@ xfs_qm_dqattach_one(
* be reclaimed as long as we have a ref from inode and we
* hold the ilock.
*/
- dqp = udqhint->q_gdquot;
+ if (type == XFS_DQ_GROUP)
+ dqp = udqhint->q_gdquot;
+ else
+ dqp = udqhint->q_pdquot;
if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
ASSERT(*IO_idqpp == NULL);
@@ -451,28 +468,42 @@ xfs_qm_dqattach_one(
/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups.
+ * Given a udquot and group/project type, attach the group/project
+ * dquot pointer to the udquot as a hint for future lookups.
*/
STATIC void
-xfs_qm_dqattach_grouphint(
- xfs_dquot_t *udq,
- xfs_dquot_t *gdq)
+xfs_qm_dqattach_hint(
+ struct xfs_inode *ip,
+ int type)
{
- xfs_dquot_t *tmp;
+ struct xfs_dquot **dqhintp;
+ struct xfs_dquot *dqp;
+ struct xfs_dquot *udq = ip->i_udquot;
+
+ ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
xfs_dqlock(udq);
- tmp = udq->q_gdquot;
- if (tmp) {
- if (tmp == gdq)
+ if (type == XFS_DQ_GROUP) {
+ dqp = ip->i_gdquot;
+ dqhintp = &udq->q_gdquot;
+ } else {
+ dqp = ip->i_pdquot;
+ dqhintp = &udq->q_pdquot;
+ }
+
+ if (*dqhintp) {
+ struct xfs_dquot *tmp;
+
+ if (*dqhintp == dqp)
goto done;
- udq->q_gdquot = NULL;
+ tmp = *dqhintp;
+ *dqhintp = NULL;
xfs_qm_dqrele(tmp);
}
- udq->q_gdquot = xfs_qm_dqhold(gdq);
+ *dqhintp = xfs_qm_dqhold(dqp);
done:
xfs_dqunlock(udq);
}
@@ -489,8 +520,7 @@ xfs_qm_need_dqattach(
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
return false;
- if (ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return false;
return true;
}
@@ -526,12 +556,8 @@ xfs_qm_dqattach_locked(
}
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_OQUOTA_ON(mp)) {
- error = XFS_IS_GQUOTA_ON(mp) ?
- xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot) :
- xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
ip->i_udquot, &ip->i_gdquot);
/*
@@ -543,14 +569,28 @@ xfs_qm_dqattach_locked(
nquotas++;
}
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ flags & XFS_QMOPT_DQALLOC,
+ ip->i_udquot, &ip->i_pdquot);
+ /*
+ * Don't worry about the udquot that we may have
+ * attached above. It'll get detached, if not already.
+ */
+ if (error)
+ goto done;
+ nquotas++;
+ }
+
/*
- * Attach this group quota to the user quota as a hint.
+ * Attach this group/project quota to the user quota as a hint.
* This WON'T, in general, result in a thrash.
*/
- if (nquotas == 2) {
+ if (nquotas > 1 && ip->i_udquot) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_udquot);
- ASSERT(ip->i_gdquot);
+ ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
+ ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
/*
* We do not have i_udquot locked at this point, but this check
@@ -559,7 +599,10 @@ xfs_qm_dqattach_locked(
* succeed in general.
*/
if (ip->i_udquot->q_gdquot != ip->i_gdquot)
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+ xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
+
+ if (ip->i_udquot->q_pdquot != ip->i_pdquot)
+ xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
}
done:
@@ -567,8 +610,10 @@ xfs_qm_dqattach_locked(
if (!error) {
if (XFS_IS_UQUOTA_ON(mp))
ASSERT(ip->i_udquot);
- if (XFS_IS_OQUOTA_ON(mp))
+ if (XFS_IS_GQUOTA_ON(mp))
ASSERT(ip->i_gdquot);
+ if (XFS_IS_PQUOTA_ON(mp))
+ ASSERT(ip->i_pdquot);
}
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
#endif
@@ -601,13 +646,12 @@ void
xfs_qm_dqdetach(
xfs_inode_t *ip)
{
- if (!(ip->i_udquot || ip->i_gdquot))
+ if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
return;
trace_xfs_dquot_dqdetach(ip);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
if (ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
@@ -616,6 +660,10 @@ xfs_qm_dqdetach(
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if (ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
}
int
@@ -660,6 +708,7 @@ xfs_qm_init_quotainfo(
INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
INIT_LIST_HEAD(&qinf->qi_lru_list);
@@ -761,6 +810,10 @@ xfs_qm_destroy_quotainfo(
IRELE(qi->qi_gquotaip);
qi->qi_gquotaip = NULL;
}
+ if (qi->qi_pquotaip) {
+ IRELE(qi->qi_pquotaip);
+ qi->qi_pquotaip = NULL;
+ }
mutex_destroy(&qi->qi_quotaofflock);
kmem_free(qi);
mp->m_quotainfo = NULL;
@@ -781,6 +834,36 @@ xfs_qm_qino_alloc(
int error;
int committed;
+ *ip = NULL;
+ /*
+ * With superblock that doesn't have separate pquotino, we
+ * share an inode between gquota and pquota. If the on-disk
+ * superblock has GQUOTA and the filesystem is now mounted
+ * with PQUOTA, just use sb_gquotino for sb_pquotino and
+ * vice-versa.
+ */
+ if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+ (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+ xfs_ino_t ino = NULLFSINO;
+
+ if ((flags & XFS_QMOPT_PQUOTA) &&
+ (mp->m_sb.sb_gquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_gquotino;
+ ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+ } else if ((flags & XFS_QMOPT_GQUOTA) &&
+ (mp->m_sb.sb_pquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_pquotino;
+ ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+ }
+ if (ino != NULLFSINO) {
+ error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+ if (error)
+ return error;
+ mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
+ }
+ }
+
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
if ((error = xfs_trans_reserve(tp,
XFS_QM_QINOCREATE_SPACE_RES(mp),
@@ -791,11 +874,14 @@ xfs_qm_qino_alloc(
return error;
}
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
- if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
- return error;
+ if (!*ip) {
+ error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
+ &committed);
+ if (error) {
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT);
+ return error;
+ }
}
/*
@@ -807,21 +893,25 @@ xfs_qm_qino_alloc(
if (flags & XFS_QMOPT_SBVERSION) {
ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
- (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
+ (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+ XFS_SB_QFLAGS));
xfs_sb_version_addquota(&mp->m_sb);
mp->m_sb.sb_uquotino = NULLFSINO;
mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
- /* qflags will get updated _after_ quotacheck */
- mp->m_sb.sb_qflags = 0;
+ /* qflags will get updated fully _after_ quotacheck */
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
}
if (flags & XFS_QMOPT_UQUOTA)
mp->m_sb.sb_uquotino = (*ip)->i_ino;
- else
+ else if (flags & XFS_QMOPT_GQUOTA)
mp->m_sb.sb_gquotino = (*ip)->i_ino;
+ else
+ mp->m_sb.sb_pquotino = (*ip)->i_ino;
spin_unlock(&mp->m_sb_lock);
xfs_mod_sb(tp, sbfields);
@@ -1152,7 +1242,7 @@ xfs_qm_dqusage_adjust(
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ if (xfs_is_quota_inode(&mp->m_sb, ino)) {
*res = BULKSTAT_RV_NOTHING;
return XFS_ERROR(EINVAL);
}
@@ -1262,19 +1352,21 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
- xfs_inode_t *uip, *gip;
- uint flags;
- LIST_HEAD (buffer_list);
+ int done, count, error, error2;
+ xfs_ino_t lastino;
+ size_t structsz;
+ uint flags;
+ LIST_HEAD (buffer_list);
+ struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
+ struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
+ struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
count = INT_MAX;
structsz = 1;
lastino = 0;
flags = 0;
- ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+ ASSERT(uip || gip || pip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1376,6 @@ xfs_qm_quotacheck(
* their counters to zero. We need a clean slate.
* We don't log our changes till later.
*/
- uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
@@ -1293,14 +1384,20 @@ xfs_qm_quotacheck(
flags |= XFS_UQUOTA_CHKD;
}
- gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
- error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
+ error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_GQUOTA_CHKD;
+ }
+
+ if (pip) {
+ error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
&buffer_list);
if (error)
goto error_return;
- flags |= XFS_OQUOTA_CHKD;
+ flags |= XFS_PQUOTA_CHKD;
}
do {
@@ -1395,15 +1492,14 @@ STATIC int
xfs_qm_init_quotainos(
xfs_mount_t *mp)
{
- xfs_inode_t *uip, *gip;
- int error;
- __int64_t sbflags;
- uint flags;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ int error;
+ __int64_t sbflags = 0;
+ uint flags = 0;
ASSERT(mp->m_quotainfo);
- uip = gip = NULL;
- sbflags = 0;
- flags = 0;
/*
* Get the uquota and gquota inodes
@@ -1412,57 +1508,80 @@ xfs_qm_init_quotainos(
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip)))
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip);
+ if (error)
return XFS_ERROR(error);
}
- if (XFS_IS_OQUOTA_ON(mp) &&
+ if (XFS_IS_GQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip))) {
- if (uip)
- IRELE(uip);
- return XFS_ERROR(error);
- }
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip);
+ if (error)
+ goto error_rele;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) &&
+ mp->m_sb.sb_pquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_pquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip);
+ if (error)
+ goto error_rele;
}
} else {
flags |= XFS_QMOPT_SBVERSION;
sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+ XFS_SB_QFLAGS);
}
/*
- * Create the two inodes, if they don't exist already. The changes
+ * Create the three inodes, if they don't exist already. The changes
* made above will get added to a transaction and logged in one of
* the qino_alloc calls below. If the device is readonly,
* temporarily switch to read-write to do this.
*/
if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
- if ((error = xfs_qm_qino_alloc(mp, &uip,
+ error = xfs_qm_qino_alloc(mp, &uip,
sbflags | XFS_SB_UQUOTINO,
- flags | XFS_QMOPT_UQUOTA)))
- return XFS_ERROR(error);
+ flags | XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_rele;
flags &= ~XFS_QMOPT_SBVERSION;
}
- if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
- flags |= (XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
error = xfs_qm_qino_alloc(mp, &gip,
- sbflags | XFS_SB_GQUOTINO, flags);
- if (error) {
- if (uip)
- IRELE(uip);
+ sbflags | XFS_SB_GQUOTINO,
+ flags | XFS_QMOPT_GQUOTA);
+ if (error)
+ goto error_rele;
- return XFS_ERROR(error);
- }
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
+ error = xfs_qm_qino_alloc(mp, &pip,
+ sbflags | XFS_SB_PQUOTINO,
+ flags | XFS_QMOPT_PQUOTA);
+ if (error)
+ goto error_rele;
}
mp->m_quotainfo->qi_uquotaip = uip;
mp->m_quotainfo->qi_gquotaip = gip;
+ mp->m_quotainfo->qi_pquotaip = pip;
return 0;
+
+error_rele:
+ if (uip)
+ IRELE(uip);
+ if (gip)
+ IRELE(gip);
+ if (pip)
+ IRELE(pip);
+ return XFS_ERROR(error);
}
STATIC void
@@ -1473,7 +1592,7 @@ xfs_qm_dqfree_one(
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -1656,10 +1775,13 @@ xfs_qm_vop_dqalloc(
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
- struct xfs_dquot **O_gdqpp)
+ struct xfs_dquot **O_gdqpp,
+ struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_dquot *uq, *gq;
+ struct xfs_dquot *uq = NULL;
+ struct xfs_dquot *gq = NULL;
+ struct xfs_dquot *pq = NULL;
int error;
uint lockflags;
@@ -1684,7 +1806,6 @@ xfs_qm_vop_dqalloc(
}
}
- uq = gq = NULL;
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
if (ip->i_d.di_uid != uid) {
/*
@@ -1697,11 +1818,12 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
XFS_DQ_USER,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &uq))) {
+ &uq);
+ if (error) {
ASSERT(error != ENOENT);
return error;
}
@@ -1723,15 +1845,14 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
XFS_DQ_GROUP,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &gq);
+ if (error) {
ASSERT(error != ENOENT);
- return error;
+ goto error_rele;
}
xfs_dqunlock(gq);
lockflags = XFS_ILOCK_SHARED;
@@ -1740,25 +1861,25 @@ xfs_qm_vop_dqalloc(
ASSERT(ip->i_gdquot);
gq = xfs_qm_dqhold(ip->i_gdquot);
}
- } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ }
+ if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
XFS_DQ_PROJ,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &pq);
+ if (error) {
ASSERT(error != ENOENT);
- return (error);
+ goto error_rele;
}
- xfs_dqunlock(gq);
+ xfs_dqunlock(pq);
lockflags = XFS_ILOCK_SHARED;
xfs_ilock(ip, lockflags);
} else {
- ASSERT(ip->i_gdquot);
- gq = xfs_qm_dqhold(ip->i_gdquot);
+ ASSERT(ip->i_pdquot);
+ pq = xfs_qm_dqhold(ip->i_pdquot);
}
}
if (uq)
@@ -1773,7 +1894,18 @@ xfs_qm_vop_dqalloc(
*O_gdqpp = gq;
else if (gq)
xfs_qm_dqrele(gq);
+ if (O_pdqpp)
+ *O_pdqpp = pq;
+ else if (pq)
+ xfs_qm_dqrele(pq);
return 0;
+
+error_rele:
+ if (gq)
+ xfs_qm_dqrele(gq);
+ if (uq)
+ xfs_qm_dqrele(uq);
+ return error;
}
/*
@@ -1821,29 +1953,34 @@ xfs_qm_vop_chown(
*/
int
xfs_qm_vop_chown_reserve(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ uint flags)
{
- xfs_mount_t *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
- xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ uint delblks, blkflags, prjflags = 0;
+ struct xfs_dquot *udq_unres = NULL;
+ struct xfs_dquot *gdq_unres = NULL;
+ struct xfs_dquot *pdq_unres = NULL;
+ struct xfs_dquot *udq_delblks = NULL;
+ struct xfs_dquot *gdq_delblks = NULL;
+ struct xfs_dquot *pdq_delblks = NULL;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
delblks = ip->i_delayed_blks;
- delblksudq = delblksgdq = unresudq = unresgdq = NULL;
blkflags = XFS_IS_REALTIME_INODE(ip) ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
- delblksudq = udqp;
+ udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
* unreserve those from the old dquot, and add them to the
@@ -1851,29 +1988,34 @@ xfs_qm_vop_chown_reserve(
*/
if (delblks) {
ASSERT(ip->i_udquot);
- unresudq = ip->i_udquot;
+ udq_unres = ip->i_udquot;
}
}
- if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
- if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
- xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
- prjflags = XFS_QMOPT_ENOSPC;
-
- if (prjflags ||
- (XFS_IS_GQUOTA_ON(ip->i_mount) &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
- delblksgdq = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- unresgdq = ip->i_gdquot;
- }
+ if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+ ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ gdq_delblks = gdqp;
+ if (delblks) {
+ ASSERT(ip->i_gdquot);
+ gdq_unres = ip->i_gdquot;
+ }
+ }
+
+ if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
+ xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+ prjflags = XFS_QMOPT_ENOSPC;
+ pdq_delblks = pdqp;
+ if (delblks) {
+ ASSERT(ip->i_pdquot);
+ pdq_unres = ip->i_pdquot;
}
}
- if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags)))
- return (error);
+ error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ ip->i_d.di_nblocks, 1,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
/*
* Do the delayed blks reservations/unreservations now. Since, these
@@ -1885,15 +2027,17 @@ xfs_qm_vop_chown_reserve(
/*
* Do the reservations first. Unreservation can't fail.
*/
- ASSERT(delblksudq || delblksgdq);
- ASSERT(unresudq || unresgdq);
- if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags)))
- return (error);
+ ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
+ ASSERT(udq_unres || gdq_unres || pdq_unres);
+ error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ (xfs_qcnt_t)delblks, 0,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
- blkflags);
+ udq_unres, gdq_unres, pdq_unres,
+ -((xfs_qcnt_t)delblks), 0, blkflags);
}
return (0);
@@ -1932,7 +2076,8 @@ xfs_qm_vop_create_dqattach(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp)
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1952,13 +2097,18 @@ xfs_qm_vop_create_dqattach(
}
if (gdqp) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(XFS_IS_OQUOTA_ON(mp));
- ASSERT((XFS_IS_GQUOTA_ON(mp) ?
- ip->i_d.di_gid : xfs_get_projid(ip)) ==
- be32_to_cpu(gdqp->q_core.d_id));
-
+ ASSERT(XFS_IS_GQUOTA_ON(mp));
+ ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+ if (pdqp) {
+ ASSERT(ip->i_pdquot == NULL);
+ ASSERT(XFS_IS_PQUOTA_ON(mp));
+ ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+
+ ip->i_pdquot = xfs_qm_dqhold(pdqp);
+ xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..579d6a02a5b6 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -44,9 +44,11 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
typedef struct xfs_quotainfo {
struct radix_tree_root qi_uquota_tree;
struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
struct mutex qi_tree_lock;
- xfs_inode_t *qi_uquotaip; /* user quota inode */
- xfs_inode_t *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_uquotaip; /* user quota inode */
+ struct xfs_inode *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_pquotaip; /* project quota inode */
struct list_head qi_lru_list;
struct mutex qi_lru_lock;
int qi_lru_count;
@@ -69,30 +71,66 @@ typedef struct xfs_quotainfo {
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
-#define XFS_DQUOT_TREE(qi, type) \
- ((type & XFS_DQ_USER) ? \
- &((qi)->qi_uquota_tree) : \
- &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+ struct xfs_quotainfo *qi,
+ int type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return &qi->qi_uquota_tree;
+ case XFS_DQ_GROUP:
+ return &qi->qi_gquota_tree;
+ case XFS_DQ_PROJ:
+ return &qi->qi_pquota_tree;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+ switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ case XFS_DQ_GROUP:
+ return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ case XFS_DQ_PROJ:
+ return dqp->q_mount->m_quotainfo->qi_pquotaip;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
unsigned int nbblks);
-extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
- xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern void xfs_trans_mod_dquot(struct xfs_trans *,
+ struct xfs_dquot *, uint, long);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, struct xfs_dquot *,
+ long, long, uint);
+extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
+ * We keep the usr, grp, and prj dquots separately so that locking will be
+ * easier to do at commit time. All transactions that we know of at this point
* affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
*/
+enum {
+ XFS_QM_TRANS_USR = 0,
+ XFS_QM_TRANS_GRP,
+ XFS_QM_TRANS_PRJ,
+ XFS_QM_TRANS_DQTYPES
+};
#define XFS_QM_TRANS_MAXDQS 2
-typedef struct xfs_dquot_acct {
- xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
- xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+ struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
/*
* Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +144,23 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_IWARNLIMIT 5
#define XFS_QM_RTBWARNLIMIT 5
-extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_quotacheck(xfs_mount_t *);
-extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int xfs_qm_quotacheck(struct xfs_mount *);
+extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
/* dquot stuff */
-extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
-extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
+extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct fs_disk_quota *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+ struct fs_disk_quota *);
+extern int xfs_qm_scall_getqstat(struct xfs_mount *,
+ struct fs_quota_stat *);
+extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2d02eac1c9a8..437a52d91f6d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -112,16 +112,16 @@ xfs_qm_newmount(
if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
(!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
(gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
+ (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
+ (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
xfs_dev_is_read_only(mp, "changing quota state")) {
xfs_warn(mp, "please mount with%s%s%s%s.",
(!quotaondisk ? "out quota" : ""),
(uquotaondisk ? " usrquota" : ""),
- (pquotaondisk ? " prjquota" : ""),
- (gquotaondisk ? " grpquota" : ""));
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return XFS_ERROR(EPERM);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..8d9e4c78e1ab 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,12 @@ xfs_qm_scall_quotaoff(
}
if (flags & XFS_GQUOTA_ACCT) {
dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
inactivate_flags |= XFS_GQUOTA_ACTIVE;
- } else if (flags & XFS_PQUOTA_ACCT) {
+ }
+ if (flags & XFS_PQUOTA_ACCT) {
dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
inactivate_flags |= XFS_PQUOTA_ACTIVE;
}
@@ -198,10 +199,9 @@ xfs_qm_scall_quotaoff(
}
/*
- * If quotas is completely disabled, close shop.
+ * If all quotas are completely turned off, close shop.
*/
- if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
- ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
+ if (mp->m_qflags == 0) {
mutex_unlock(&q->qi_quotaofflock);
xfs_qm_destroy_quotainfo(mp);
return (0);
@@ -214,10 +214,14 @@ xfs_qm_scall_quotaoff(
IRELE(q->qi_uquotaip);
q->qi_uquotaip = NULL;
}
- if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+ if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
IRELE(q->qi_gquotaip);
q->qi_gquotaip = NULL;
}
+ if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
+ IRELE(q->qi_pquotaip);
+ q->qi_pquotaip = NULL;
+ }
out_unlock:
mutex_unlock(&q->qi_quotaofflock);
@@ -292,8 +296,10 @@ xfs_qm_scall_trunc_qfiles(
if (flags & XFS_DQ_USER)
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
- if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+ if (flags & XFS_DQ_GROUP)
error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (flags & XFS_DQ_PROJ)
+ error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
return error ? error : error2;
}
@@ -335,14 +341,14 @@ xfs_qm_scall_quotaon(
* quota acct on ondisk without m_qflags' knowing.
*/
if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
- (flags & XFS_UQUOTA_ENFD))
- ||
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD)) ||
+ ((flags & XFS_GQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ENFD)) ||
((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
- (flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))) {
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
__func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +413,13 @@ xfs_qm_scall_getqstat(
struct fs_quota_stat *out)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip, *gip;
- bool tempuqip, tempgqip;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ bool tempuqip = false;
+ bool tempgqip = false;
+ bool temppqip = false;
- uip = gip = NULL;
- tempuqip = tempgqip = false;
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
@@ -420,16 +428,14 @@ xfs_qm_scall_getqstat(
out->qs_gquota.qfs_ino = NULLFSINO;
return (0);
}
+
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
(XFS_ALL_QUOTA_ACCT|
XFS_ALL_QUOTA_ENFD));
- out->qs_pad = 0;
- out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
- out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
-
if (q) {
uip = q->qi_uquotaip;
gip = q->qi_gquotaip;
+ pip = q->qi_pquotaip;
}
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,18 +447,41 @@ xfs_qm_scall_getqstat(
0, 0, &gip) == 0)
tempgqip = true;
}
+ /*
+ * Q_XGETQSTAT doesn't have room for both group and project quotas.
+ * So, allow the project quota values to be copied out only if
+ * there is no group quota information available.
+ */
+ if (!gip) {
+ if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip) == 0)
+ temppqip = true;
+ }
+ } else
+ pip = NULL;
if (uip) {
+ out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
if (tempuqip)
IRELE(uip);
}
+
if (gip) {
+ out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
if (tempgqip)
IRELE(gip);
}
+ if (pip) {
+ out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+ out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
+ out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
+ if (temppqip)
+ IRELE(pip);
+ }
if (q) {
out->qs_incoredqs = q->qi_dquots;
out->qs_btimelimit = q->qi_btimelimit;
@@ -776,9 +805,12 @@ xfs_qm_scall_getquota(
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_USER) ||
+ (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+ (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_PROJ)) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -786,8 +818,8 @@ xfs_qm_scall_getquota(
#ifdef DEBUG
if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
- (XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+ (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+ (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
dst->d_id != 0) {
if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
@@ -833,16 +865,16 @@ xfs_qm_export_flags(
uflags = 0;
if (flags & XFS_UQUOTA_ACCT)
uflags |= FS_QUOTA_UDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_GQUOTA_ACCT)
uflags |= FS_QUOTA_GDQ_ACCT;
+ if (flags & XFS_PQUOTA_ACCT)
+ uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_UQUOTA_ENFD)
uflags |= FS_QUOTA_UDQ_ENFD;
- if (flags & (XFS_OQUOTA_ENFD)) {
- uflags |= (flags & XFS_GQUOTA_ACCT) ?
- FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
- }
+ if (flags & XFS_GQUOTA_ENFD)
+ uflags |= FS_QUOTA_GDQ_ENFD;
+ if (flags & XFS_PQUOTA_ENFD)
+ uflags |= FS_QUOTA_PDQ_ENFD;
return (uflags);
}
@@ -856,9 +888,11 @@ xfs_dqrele_inode(
{
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
+ ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
+ ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
+ ASSERT(ip->i_pdquot == NULL);
return 0;
}
@@ -867,10 +901,14 @@ xfs_dqrele_inode(
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
}
- if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..b14f42c714b6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,11 +108,28 @@ typedef struct xfs_dqblk {
{ XFS_DQ_FREEING, "FREEING" }
/*
- * In the worst case, when both user and group quotas are on,
- * we can have a max of three dquots changing in a single transaction.
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
*/
-#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
-
+#define XFS_DQUOT_LOGRES(mp) \
+ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
/*
* These are the structures used to lay out dquots and quotaoff
@@ -161,30 +178,42 @@ typedef struct xfs_qoff_logformat {
#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+/*
* Quota Accounting/Enforcement flags
*/
#define XFS_ALL_QUOTA_ACCT \
(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
/*
* Incore only flags for quotaoff - these bits get cleared when quota(s)
* are in the process of getting turned off. These flags are in m_qflags but
* never in sb_qflags.
*/
-#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+ (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
/*
* Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,33 +288,24 @@ typedef struct xfs_qoff_logformat {
* we didn't have the inode locked, the appropriate dquot(s) will be
* attached atomically.
*/
-#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
- (ip)->i_udquot == NULL) || \
- (XFS_IS_OQUOTA_ON(mp) && \
- (ip)->i_gdquot == NULL))
+#define XFS_NOT_DQATTACHED(mp, ip) \
+ ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
+ (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
+ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
#define XFS_QM_NEED_QUOTACHECK(mp) \
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
(XFS_IS_GQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
(XFS_IS_PQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
-
-#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
-
-#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
- XFS_GQUOTA_ACCT)
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
/*
@@ -318,17 +338,18 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
struct xfs_inode *, long, long, uint);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, long, long, uint);
+ struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
- struct xfs_dquot **, struct xfs_dquot **);
+ struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *);
+ struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *, uint);
+ struct xfs_dquot *, struct xfs_dquot *,
+ struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *, uint);
extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -342,10 +363,12 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
- uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+ uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
+ struct xfs_dquot **pdqp)
{
*udqp = NULL;
*gdqp = NULL;
+ *pdqp = NULL;
return 0;
}
#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -360,14 +383,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
}
static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
struct xfs_mount *mp, struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ long nblks, long nions, uint flags)
{
return 0;
}
-#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip, fl) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
@@ -381,8 +405,8 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
+ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
f | XFS_QMOPT_RES_REGBLKS)
extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
flags |= XFS_GQUOTA_ACCT;
if (uflags & FS_QUOTA_UDQ_ENFD)
flags |= XFS_UQUOTA_ENFD;
- if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ flags |= XFS_GQUOTA_ENFD;
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ flags |= XFS_PQUOTA_ENFD;
switch (op) {
case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..a6ff9d6e72f6 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,10 +618,23 @@ xfs_sb_has_incompat_log_feature(
return (sbp->sb_features_log_incompat & feature) != 0;
}
+static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
/*
* end of superblock version macros
*/
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino ||
+ ino == sbp->sb_gquotino ||
+ ino == sbp->sb_pquotino);
+}
+
#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..19922ebeea25 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
!strcmp(this_char, MNTOPT_PRJQUOTA)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_PQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
!strcmp(this_char, MNTOPT_GRPQUOTA)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_GQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
xfs_warn(mp,
"delaylog is the default now, option is deprecated.");
@@ -420,12 +421,6 @@ xfs_parseargs(
}
#endif
- if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
- (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
- xfs_warn(mp, "cannot mount with both project and group quota");
- return EINVAL;
- }
-
if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
xfs_warn(mp, "sunit and swidth must be specified together");
return EINVAL;
@@ -439,20 +434,15 @@ xfs_parseargs(
}
done:
- if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
/*
* At this point the superblock has not been read
* in, therefore we do not know the block size.
* Before the mount call ends we will convert
* these to FSBs.
*/
- if (dsunit) {
- mp->m_dalign = dsunit;
- mp->m_flags |= XFS_MOUNT_RETERR;
- }
-
- if (dswidth)
- mp->m_swidth = dswidth;
+ mp->m_dalign = dsunit;
+ mp->m_swidth = dswidth;
}
if (mp->m_logbufs != -1 &&
@@ -560,15 +550,14 @@ xfs_showargs(
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, "," MNTOPT_UQUOTANOENF);
- /* Either project or group quotas can be active, not both */
-
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
seq_puts(m, "," MNTOPT_PRJQUOTA);
else
seq_puts(m, "," MNTOPT_PQUOTANOENF);
- } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ }
+ if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
seq_puts(m, "," MNTOPT_GRPQUOTA);
else
seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -874,17 +863,17 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -1136,8 +1125,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
- (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
return 0;
}
@@ -1400,6 +1389,14 @@ xfs_finish_flags(
return XFS_ERROR(EROFS);
}
+ if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+ (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
+ !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+ xfs_warn(mp,
+ "Super block does not support project and group quota together");
+ return XFS_ERROR(EINVAL);
+ }
+
return 0;
}
@@ -1481,6 +1478,10 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+ /* version 5 superblocks support inode version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ sb->s_flags |= MS_I_VERSION;
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1655,9 +1656,15 @@ xfs_init_zones(void)
KM_ZONE_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
+ xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+ "xfs_icr");
+ if (!xfs_icreate_zone)
+ goto out_destroy_ili_zone;
return 0;
+ out_destroy_ili_zone:
+ kmem_zone_destroy(xfs_ili_zone);
out_destroy_inode_zone:
kmem_zone_destroy(xfs_inode_zone);
out_destroy_efi_zone:
@@ -1696,6 +1703,7 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_icreate_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..f4895b662fcb 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,9 @@ xfs_symlink(
int n;
xfs_buf_t *bp;
prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
uint resblks;
*ipp = NULL;
@@ -385,7 +387,7 @@ xfs_symlink(
* Make sure that we have allocated dquot(s) on disk.
*/
error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
if (error)
goto std_return;
@@ -426,7 +428,8 @@ xfs_symlink(
/*
* Reserve disk quota : blocks and inode.
*/
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
if (error)
goto error_return;
@@ -464,7 +467,7 @@ xfs_symlink(
/*
* Also attach the dquot(s) to it, if applicable.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
if (resblks)
resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -562,6 +565,7 @@ xfs_symlink(
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
*ipp = ip;
return 0;
@@ -575,6 +579,7 @@ xfs_symlink(
xfs_trans_cancel(tp, cancel_flags);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -585,7 +590,7 @@ xfs_symlink(
/*
* Free a symlink that has blocks associated with it.
*/
-int
+STATIC int
xfs_inactive_symlink_rmt(
xfs_inode_t *ip,
xfs_trans_t **tpp)
@@ -606,7 +611,7 @@ xfs_inactive_symlink_rmt(
tp = *tpp;
mp = ip->i_mount;
- ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+ ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
/*
* We're freeing a symlink that has some
* blocks allocated to it. Free the
@@ -720,3 +725,47 @@ xfs_inactive_symlink_rmt(
error0:
return error;
}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+ struct xfs_inode *ip,
+ struct xfs_trans **tp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int pathlen;
+
+ trace_xfs_inactive_symlink(ip);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * Zero length symlinks _can_ exist.
+ */
+ pathlen = (int)ip->i_d.di_size;
+ if (!pathlen)
+ return 0;
+
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+ __func__, (unsigned long long)ip->i_ino, pathlen);
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (ip->i_df.if_flags & XFS_IFINLINE) {
+ if (ip->i_df.if_bytes > 0)
+ xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+ XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+ return 0;
+ }
+
+ /* remove the remote symlink */
+ return xfs_inactive_symlink_rmt(ip, tp);
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp);
+int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
#endif /* __KERNEL__ */
#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
#ifdef CONFIG_PROC_FS
STATIC int
xfs_stats_clear_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
STATIC int
xfs_panic_mask_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int ret, *valp = ctl->data;
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
{}
};
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
{
.procname = "xfs",
.mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
{}
};
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
TP_PROTO(struct xfs_buf_log_item *bip), \
TP_ARGS(bip))
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
DEFINE_INODE_EVENT(xfs_getattr);
DEFINE_INODE_EVENT(xfs_setattr);
DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
@@ -974,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
- TP_ARGS(inode, page, off),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
+ __field(unsigned int, length)
__field(int, delalloc)
__field(int, unwritten)
),
@@ -995,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
+ __entry->length = len;
__entry->delalloc = delalloc;
__entry->unwritten = unwritten;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "delalloc %d unwritten %d",
+ "length %x delalloc %d unwritten %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
+ __entry->length,
__entry->delalloc,
__entry->unwritten)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off))
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
}
/*
- * For symlink we can modify:
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
* the parent directory inode: inode size
* the new inode: inode size
- * the inode btree entry: 1 block
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
* the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
* the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
*/
STATIC uint
-xfs_calc_symlink_reservation(
+xfs_calc_create_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(1, 1024)),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_create_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
}
/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
+ * For icreate we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
*/
STATIC uint
-xfs_calc_create_reservation(
+xfs_calc_icreate_resv_alloc(
struct xfs_mount *mp)
{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
- xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- (uint)XFS_FSB_TO_B(mp, 1) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- mp->m_sb.sb_sectsize +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(mp->m_in_maxlevels,
- XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1))));
+ MAX(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return xfs_calc_icreate_reservation(mp);
+ return __xfs_calc_create_reservation(mp);
+
}
/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
return xfs_calc_create_reservation(mp);
}
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp) +
+ xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
/*
* In freeing an inode we can modify:
* the inode being freed: inode size
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_SWAPEXT 40
#define XFS_TRANS_SB_COUNT 41
#define XFS_TRANS_CHECKPOINT 42
-#define XFS_TRANS_TYPE_MAX 42
+#define XFS_TRANS_ICREATE 43
+#define XFS_TRANS_TYPE_MAX 43
/* new transaction types need to be reflected in xfs_logprint(8) */
#define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
/*
* Per-extent log reservation for the allocation btree changes
* involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
*/
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
/*
* Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
* v2 directory blocks can be fragmented below the dirblksize down to the fsb
* size, so account for that in the DAENTER macros.
*/
-#define XFS_DIROP_LOG_RES(mp) \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
#define XFS_DIROP_LOG_COUNT(mp) \
(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
return XFS_ERROR(EIO);
}
-
/*
* Release the buffer bp which was previously acquired with one of the
* xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
- bip->bli_flags |= XFS_BLI_LOGGED;
- xfs_buf_item_log(bip, first, last);
+
+ /*
+ * If we have an ordered buffer we are not logging any dirty range but
+ * it still needs to be marked dirty and that it has been logged.
+ */
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+ if (!(bip->bli_flags & XFS_BLI_ORDERED))
+ xfs_buf_item_log(bip, first, last);
}
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
}
/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+}
+
+/*
* Set the type of the buffer for log recovery so that it can correctly identify
* and hence attach the correct buffer ops to the buffer after replay.
*/
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..61407a847b86 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
return;
xfs_trans_alloc_dqinfo(ntp);
- oqa = otp->t_dqinfo->dqa_usrdquots;
- nqa = ntp->t_dqinfo->dqa_usrdquots;
/*
* Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
- for (j = 0; j < 2; j++) {
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ oqa = otp->t_dqinfo->dqs[j];
+ nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (oqa[i].qt_dquot == NULL)
break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
oq->qt_ino_res = oq->qt_ino_res_used;
}
- oqa = otp->t_dqinfo->dqa_grpdquots;
- nqa = ntp->t_dqinfo->dqa_grpdquots;
}
}
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
if (!XFS_IS_QUOTA_RUNNING(mp) ||
!XFS_IS_QUOTA_ON(mp) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
if (tp->t_dqinfo == NULL)
@@ -166,20 +163,28 @@ xfs_trans_mod_dquot_byino(
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
- if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+ if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
}
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
xfs_trans_get_dqtrx(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
- int i;
- xfs_dqtrx_t *qa;
-
- qa = XFS_QM_ISUDQ(dqp) ?
- tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+ int i;
+ struct xfs_dqtrx *qa;
+
+ if (XFS_QM_ISUDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+ else if (XFS_QM_ISGDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+ else if (XFS_QM_ISPDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+ else
+ return NULL;
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -292,11 +297,10 @@ xfs_trans_mod_dquot(
/*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified. We know that the
+ * highest number of dquots of one type - usr, grp OR prj - involved in a
+ * transaction is 2 so we don't need to make this very generic.
*/
STATIC void
xfs_trans_dqlockedjoin(
@@ -339,12 +343,10 @@ xfs_trans_apply_dquot_deltas(
return;
ASSERT(tp->t_dqinfo);
- qa = tp->t_dqinfo->dqa_usrdquots;
- for (j = 0; j < 2; j++) {
- if (qa[0].qt_dquot == NULL) {
- qa = tp->t_dqinfo->dqa_grpdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+ if (qa[0].qt_dquot == NULL)
continue;
- }
/*
* Lock all of the dquots and join them to the transaction.
@@ -495,10 +497,6 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_res_rtbcount >=
be64_to_cpu(dqp->q_core.d_rtbcount));
}
- /*
- * Do the group quotas next
- */
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -521,9 +519,9 @@ xfs_trans_unreserve_and_mod_dquots(
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
- qa = tp->t_dqinfo->dqa_usrdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
- for (j = 0; j < 2; j++) {
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
@@ -565,7 +563,6 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqunlock(dqp);
}
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -640,8 +637,8 @@ xfs_trans_dqresv(
if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
dqp->q_core.d_id &&
((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
- (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+ (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+ (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
if (nblks > 0) {
/*
* dquot is locked already. See if we'd go over the
@@ -736,8 +733,8 @@ error_return:
/*
* Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows a all-or-nothing
* approach.
*
* flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +745,16 @@ error_return:
*/
int
xfs_trans_reserve_quota_bydquots(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- long nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ long nblks,
+ long ninos,
+ uint flags)
{
- int resvd = 0, error;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
@@ -771,28 +769,34 @@ xfs_trans_reserve_quota_bydquots(
(flags & ~XFS_QMOPT_ENOSPC));
if (error)
return error;
- resvd = 1;
}
if (gdqp) {
error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
- if (error) {
- /*
- * can't do it, so backout previous reservation
- */
- if (resvd) {
- flags |= XFS_QMOPT_FORCE_RES;
- xfs_trans_dqresv(tp, mp, udqp,
- -nblks, -ninos, flags);
- }
- return error;
- }
+ if (error)
+ goto unwind_usr;
+ }
+
+ if (pdqp) {
+ error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+ if (error)
+ goto unwind_grp;
}
/*
* Didn't change anything critical, so, no need to log
*/
return 0;
+
+unwind_grp:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (gdqp)
+ xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (udqp)
+ xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+ return error;
}
@@ -816,8 +820,7 @@ xfs_trans_reserve_quota_nblks(
if (XFS_IS_PQUOTA_ON(mp))
flags |= XFS_QMOPT_ENOSPC;
- ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +833,7 @@ xfs_trans_reserve_quota_nblks(
*/
return xfs_trans_reserve_quota_bydquots(tp, mp,
ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot,
nblks, ninos, flags);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur.
+ */
+ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ IS_I_VERSION(VFS_I(ip))) {
+ inode_inc_iversion(VFS_I(ip));
+ ip->i_d.di_changecount = VFS_I(ip)->i_version;
+ flags |= XFS_ILOG_CORE;
+ }
+
tp->t_flags |= XFS_TRANS_DIRTY;
ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..dc730ac272be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
xfs_trans_ijoin(tp, ip, 0);
if (S_ISLNK(ip->i_d.di_mode)) {
- /*
- * Zero length symlinks _can_ exist.
- */
- if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
- error = xfs_inactive_symlink_rmt(ip, &tp);
- if (error)
- goto out_cancel;
- } else if (ip->i_df.if_bytes > 0) {
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- }
+ error = xfs_inactive_symlink(ip, &tp);
+ if (error)
+ goto out_cancel;
} else if (truncate) {
ip->i_d.di_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -498,6 +489,7 @@ xfs_create(
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
uint resblks;
uint log_res;
uint log_count;
@@ -516,7 +508,8 @@ xfs_create(
* Make sure that we have allocated dquot(s) on disk.
*/
error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -568,7 +561,8 @@ xfs_create(
/*
* Reserve disk quota and the inode.
*/
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
if (error)
goto out_trans_cancel;
@@ -632,7 +626,7 @@ xfs_create(
* These ids of the inode couldn't have changed since the new
* inode has been locked ever since it was created.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
@@ -644,6 +638,7 @@ xfs_create(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
*ipp = ip;
return 0;
@@ -665,6 +660,7 @@ xfs_create(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1577,7 +1573,7 @@ xfs_free_file_space(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot,
+ ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
resblks, 0, XFS_QMOPT_RES_REGBLKS);
if (error)
goto error1;