summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig9
-rw-r--r--fs/9p/Makefile3
-rw-r--r--fs/9p/cache.c474
-rw-r--r--fs/9p/cache.h176
-rw-r--r--fs/9p/v9fs.c196
-rw-r--r--fs/9p/v9fs.h13
-rw-r--r--fs/9p/v9fs_vfs.h6
-rw-r--r--fs/9p/vfs_addr.c88
-rw-r--r--fs/9p/vfs_dir.c93
-rw-r--r--fs/9p/vfs_file.c25
-rw-r--r--fs/9p/vfs_inode.c66
-rw-r--r--fs/9p/vfs_super.c16
-rw-r--r--fs/Kconfig8
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c7
-rw-r--r--fs/afs/cache.h12
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/proc.c8
-rw-r--r--fs/afs/write.c9
-rw-r--r--fs/aio.c57
-rw-r--r--fs/anon_inodes.c70
-rw-r--r--fs/attr.c46
-rw-r--r--fs/autofs/dirhash.c2
-rw-r--r--fs/befs/linuxvfs.c9
-rw-r--r--fs/binfmt_elf.c124
-rw-r--r--fs/binfmt_elf_fdpic.c73
-rw-r--r--fs/binfmt_flat.c22
-rw-r--r--fs/bio.c77
-rw-r--r--fs/block_dev.c174
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c329
-rw-r--r--fs/btrfs/async-thread.h18
-rw-r--r--fs/btrfs/btrfs_inode.h17
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h114
-rw-r--r--fs/btrfs/dir-item.c47
-rw-r--r--fs/btrfs/disk-io.c285
-rw-r--r--fs/btrfs/export.c133
-rw-r--r--fs/btrfs/extent-tree.c2319
-rw-r--r--fs/btrfs/extent_io.c446
-rw-r--r--fs/btrfs/extent_io.h43
-rw-r--r--fs/btrfs/extent_map.c103
-rw-r--r--fs/btrfs/extent_map.h5
-rw-r--r--fs/btrfs/file.c118
-rw-r--r--fs/btrfs/free-space-cache.c38
-rw-r--r--fs/btrfs/inode-item.c4
-rw-r--r--fs/btrfs/inode-map.c93
-rw-r--r--fs/btrfs/inode.c1096
-rw-r--r--fs/btrfs/ioctl.c404
-rw-r--r--fs/btrfs/ioctl.h3
-rw-r--r--fs/btrfs/ordered-data.c133
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/orphan.c20
-rw-r--r--fs/btrfs/relocation.c284
-rw-r--r--fs/btrfs/root-tree.c140
-rw-r--r--fs/btrfs/super.c14
-rw-r--r--fs/btrfs/transaction.c110
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-log.c83
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/volumes.c125
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c79
-rw-r--r--fs/ceph/Kconfig26
-rw-r--r--fs/ceph/Makefile37
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1115
-rw-r--r--fs/ceph/auth.c225
-rw-r--r--fs/ceph/auth.h77
-rw-r--r--fs/ceph/auth_none.c120
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/buffer.c34
-rw-r--r--fs/ceph/buffer.h55
-rw-r--r--fs/ceph/caps.c2863
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h647
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/debugfs.c450
-rw-r--r--fs/ceph/decode.h159
-rw-r--r--fs/ceph/dir.c1214
-rw-r--r--fs/ceph/export.c223
-rw-r--r--fs/ceph/file.c904
-rw-r--r--fs/ceph/inode.c1624
-rw-r--r--fs/ceph/ioctl.c157
-rw-r--r--fs/ceph/ioctl.h39
-rw-r--r--fs/ceph/mds_client.c2976
-rw-r--r--fs/ceph/mds_client.h327
-rw-r--r--fs/ceph/mdsmap.c170
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2103
-rw-r--r--fs/ceph/messenger.h253
-rw-r--r--fs/ceph/mon_client.c751
-rw-r--r--fs/ceph/mon_client.h115
-rw-r--r--fs/ceph/msgpool.c181
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h167
-rw-r--r--fs/ceph/osd_client.c1363
-rw-r--r--fs/ceph/osd_client.h150
-rw-r--r--fs/ceph/osdmap.c916
-rw-r--r--fs/ceph/osdmap.h124
-rw-r--r--fs/ceph/rados.h370
-rw-r--r--fs/ceph/snap.c887
-rw-r--r--fs/ceph/super.c984
-rw-r--r--fs/ceph/super.h895
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c842
-rw-r--r--fs/char_dev.c43
-rw-r--r--fs/cifs/CHANGES14
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c108
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h30
-rw-r--r--fs/cifs/cifsproto.h12
-rw-r--r--fs/cifs/cifssmb.c317
-rw-r--r--fs/cifs/connect.c61
-rw-r--r--fs/cifs/dir.c77
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c180
-rw-r--r--fs/cifs/inode.c66
-rw-r--r--fs/cifs/misc.c48
-rw-r--r--fs/cifs/readdir.c11
-rw-r--r--fs/cifs/transport.c51
-rw-r--r--fs/coda/coda_int.h1
-rw-r--r--fs/coda/psdev.c1
-rw-r--r--fs/compat.c33
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/devpts/inode.c3
-rw-r--r--fs/dlm/debug_fs.c12
-rw-r--r--fs/dlm/lowcomms.c62
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/drop_caches.c4
-rw-r--r--fs/ecryptfs/Kconfig5
-rw-r--r--fs/ecryptfs/crypto.c39
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c99
-rw-r--r--fs/ecryptfs/keystore.c39
-rw-r--r--fs/ecryptfs/kthread.c24
-rw-r--r--fs/ecryptfs/main.c10
-rw-r--r--fs/ecryptfs/mmap.c6
-rw-r--r--fs/ecryptfs/read_write.c32
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/eventfd.c67
-rw-r--r--fs/exec.c133
-rw-r--r--fs/exofs/super.c6
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext2/acl.c8
-rw-r--r--fs/ext2/acl.h4
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext2/namei.c6
-rw-r--r--fs/ext2/super.c184
-rw-r--r--fs/ext2/xip.c7
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/acl.h4
-rw-r--r--fs/ext3/file.c63
-rw-r--r--fs/ext3/fsync.c40
-rw-r--r--fs/ext3/inode.c67
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext3/super.c457
-rw-r--r--fs/ext3/xattr.c7
-rw-r--r--fs/ext4/Kconfig23
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/acl.h4
-rw-r--r--fs/ext4/balloc.c48
-rw-r--r--fs/ext4/block_validity.c3
-rw-r--r--fs/ext4/ext4.h159
-rw-r--r--fs/ext4/ext4_extents.h11
-rw-r--r--fs/ext4/ext4_jbd2.c87
-rw-r--r--fs/ext4/ext4_jbd2.h29
-rw-r--r--fs/ext4/extents.c594
-rw-r--r--fs/ext4/file.c57
-rw-r--r--fs/ext4/fsync.c24
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c824
-rw-r--r--fs/ext4/ioctl.c8
-rw-r--r--fs/ext4/mballoc.c808
-rw-r--r--fs/ext4/mballoc.h57
-rw-r--r--fs/ext4/migrate.c47
-rw-r--r--fs/ext4/move_extent.c581
-rw-r--r--fs/ext4/namei.c43
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c317
-rw-r--r--fs/ext4/xattr.c30
-rw-r--r--fs/fat/fat.h5
-rw-r--r--fs/fat/fatent.c25
-rw-r--r--fs/fat/file.c22
-rw-r--r--fs/fat/inode.c42
-rw-r--r--fs/fat/misc.c12
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c108
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fs-writeback.c1197
-rw-r--r--fs/fuse/control.c138
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/dir.c18
-rw-r--r--fs/fuse/file.c7
-rw-r--r--fs/fuse/fuse_i.h20
-rw-r--r--fs/fuse/inode.c94
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c385
-rw-r--r--fs/gfs2/acl.h24
-rw-r--r--fs/gfs2/aops.c23
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/dir.c34
-rw-r--r--fs/gfs2/eaops.c157
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/export.c36
-rw-r--r--fs/gfs2/file.c3
-rw-r--r--fs/gfs2/glock.c10
-rw-r--r--fs/gfs2/glock.h9
-rw-r--r--fs/gfs2/glops.c5
-rw-r--r--fs/gfs2/incore.h20
-rw-r--r--fs/gfs2/inode.c163
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/lops.c4
-rw-r--r--fs/gfs2/ops_fstype.c218
-rw-r--r--fs/gfs2/ops_inode.c83
-rw-r--r--fs/gfs2/quota.c393
-rw-r--r--fs/gfs2/quota.h5
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c96
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c152
-rw-r--r--fs/gfs2/super.h9
-rw-r--r--fs/gfs2/sys.c45
-rw-r--r--fs/gfs2/util.c41
-rw-r--r--fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c)445
-rw-r--r--fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h)50
-rw-r--r--fs/hfs/btree.c5
-rw-r--r--fs/hfs/mdb.c6
-rw-r--r--fs/hfsplus/super.c6
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/hugetlbfs/inode.c49
-rw-r--r--fs/inode.c128
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c11
-rw-r--r--fs/isofs/compress.c533
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/inode.c8
-rw-r--r--fs/isofs/rock.c3
-rw-r--r--fs/jbd/checkpoint.c6
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c33
-rw-r--r--fs/jbd/recovery.c18
-rw-r--r--fs/jbd/revoke.c16
-rw-r--r--fs/jbd/transaction.c9
-rw-r--r--fs/jbd2/checkpoint.c7
-rw-r--r--fs/jbd2/commit.c73
-rw-r--r--fs/jbd2/journal.c215
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/background.c20
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/malloc.c4
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jfs/acl.c7
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/libfs.c13
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c2
-rw-r--r--fs/lockd/host.c18
-rw-r--r--fs/lockd/mon.c46
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/lockd/xdr.c1
-rw-r--r--fs/lockd/xdr4.c1
-rw-r--r--fs/locks.c6
-rw-r--r--fs/minix/dir.c22
-rw-r--r--fs/namei.c97
-rw-r--r--fs/namespace.c77
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/ncpfs/ioctl.c8
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/cache_lib.c140
-rw-r--r--fs/nfs/cache_lib.h27
-rw-r--r--fs/nfs/callback.c26
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c45
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/dns_resolve.c335
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c58
-rw-r--r--fs/nfs/fscache.c25
-rw-r--r--fs/nfs/fscache.h6
-rw-r--r--fs/nfs/idmap.c6
-rw-r--r--fs/nfs/inode.c154
-rw-r--r--fs/nfs/internal.h39
-rw-r--r--fs/nfs/mount_clnt.c83
-rw-r--r--fs/nfs/nfs2xdr.c1
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4namespace.c30
-rw-r--r--fs/nfs/nfs4proc.c58
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4xdr.c1462
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c528
-rw-r--r--fs/nfs/write.c94
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/export.c40
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs2acl.c14
-rw-r--r--fs/nfsd/nfs3acl.c3
-rw-r--r--fs/nfsd/nfs3proc.c1
-rw-r--r--fs/nfsd/nfs3xdr.c75
-rw-r--r--fs/nfsd/nfs4acl.c6
-rw-r--r--fs/nfsd/nfs4callback.c263
-rw-r--r--fs/nfsd/nfs4idmap.c21
-rw-r--r--fs/nfsd/nfs4proc.c90
-rw-r--r--fs/nfsd/nfs4recover.c1
-rw-r--r--fs/nfsd/nfs4state.c746
-rw-r--r--fs/nfsd/nfs4xdr.c43
-rw-r--r--fs/nfsd/nfsctl.c31
-rw-r--r--fs/nfsd/nfsfh.c159
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/nfssvc.c57
-rw-r--r--fs/nfsd/vfs.c75
-rw-r--r--fs/nfsd/vfs.h98
-rw-r--r--fs/nilfs2/Kconfig2
-rw-r--r--fs/nilfs2/alloc.c108
-rw-r--r--fs/nilfs2/alloc.h21
-rw-r--r--fs/nilfs2/bmap.c159
-rw-r--r--fs/nilfs2/bmap.h76
-rw-r--r--fs/nilfs2/btnode.c81
-rw-r--r--fs/nilfs2/btnode.h6
-rw-r--r--fs/nilfs2/btree.c725
-rw-r--r--fs/nilfs2/btree.h22
-rw-r--r--fs/nilfs2/cpfile.c39
-rw-r--r--fs/nilfs2/cpfile.h5
-rw-r--r--fs/nilfs2/dat.c89
-rw-r--r--fs/nilfs2/dat.h11
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c161
-rw-r--r--fs/nilfs2/file.c6
-rw-r--r--fs/nilfs2/gcdat.c3
-rw-r--r--fs/nilfs2/gcinode.c8
-rw-r--r--fs/nilfs2/ifile.c35
-rw-r--r--fs/nilfs2/ifile.h3
-rw-r--r--fs/nilfs2/inode.c13
-rw-r--r--fs/nilfs2/ioctl.c65
-rw-r--r--fs/nilfs2/mdt.c96
-rw-r--r--fs/nilfs2/mdt.h24
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h14
-rw-r--r--fs/nilfs2/recovery.c35
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/segment.c87
-rw-r--r--fs/nilfs2/sufile.c203
-rw-r--r--fs/nilfs2/sufile.h15
-rw-r--r--fs/nilfs2/super.c188
-rw-r--r--fs/nilfs2/the_nilfs.c166
-rw-r--r--fs/nilfs2/the_nilfs.h53
-rw-r--r--fs/nls/nls_base.c11
-rw-r--r--fs/notify/dnotify/dnotify.c3
-rw-r--r--fs/notify/inode_mark.c6
-rw-r--r--fs/notify/notification.c2
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/file.c54
-rw-r--r--fs/ntfs/layout.h2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ntfs/mft.c13
-rw-r--r--fs/ntfs/super.c10
-rw-r--r--fs/ocfs2/Kconfig10
-rw-r--r--fs/ocfs2/Makefile8
-rw-r--r--fs/ocfs2/acl.h22
-rw-r--r--fs/ocfs2/alloc.c1342
-rw-r--r--fs/ocfs2/alloc.h101
-rw-r--r--fs/ocfs2/aops.c38
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/buffer_head_io.c47
-rw-r--r--fs/ocfs2/buffer_head_io.h8
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/dir.c107
-rw-r--r--fs/ocfs2/dlm/dlmast.c1
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c11
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlm/dlmthread.c7
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c1
-rw-r--r--fs/ocfs2/dlmglue.c105
-rw-r--r--fs/ocfs2/dlmglue.h6
-rw-r--r--fs/ocfs2/extent_map.c33
-rw-r--r--fs/ocfs2/extent_map.h8
-rw-r--r--fs/ocfs2/file.c203
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/inode.c86
-rw-r--r--fs/ocfs2/inode.h20
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/journal.c82
-rw-r--r--fs/ocfs2/journal.h94
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/namei.c341
-rw-r--r--fs/ocfs2/namei.h6
-rw-r--r--fs/ocfs2/ocfs2.h67
-rw-r--r--fs/ocfs2/ocfs2_fs.h107
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c9
-rw-r--r--fs/ocfs2/quota_local.c28
-rw-r--r--fs/ocfs2/refcounttree.c4372
-rw-r--r--fs/ocfs2/refcounttree.h106
-rw-r--r--fs/ocfs2/resize.c16
-rw-r--r--fs/ocfs2/slot_map.c10
-rw-r--r--fs/ocfs2/suballoc.c35
-rw-r--r--fs/ocfs2/super.c133
-rw-r--r--fs/ocfs2/symlink.c1
-rw-r--r--fs/ocfs2/uptodate.c270
-rw-r--r--fs/ocfs2/uptodate.h51
-rw-r--r--fs/ocfs2/xattr.c2060
-rw-r--r--fs/ocfs2/xattr.h17
-rw-r--r--fs/omfs/dir.c4
-rw-r--r--fs/omfs/file.c6
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/omfs/omfs.h10
-rw-r--r--fs/open.c17
-rw-r--r--fs/partitions/check.c16
-rw-r--r--fs/pipe.c41
-rw-r--r--fs/proc/array.c92
-rw-r--r--fs/proc/base.c70
-rw-r--r--fs/proc/kcore.c334
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/page.c10
-rw-r--r--fs/proc/proc_devtree.c41
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/task_mmu.c57
-rw-r--r--fs/proc/uptime.c7
-rw-r--r--fs/qnx4/Kconfig11
-rw-r--r--fs/qnx4/Makefile2
-rw-r--r--fs/qnx4/bitmap.c81
-rw-r--r--fs/qnx4/dir.c5
-rw-r--r--fs/qnx4/file.c40
-rw-r--r--fs/qnx4/inode.c84
-rw-r--r--fs/qnx4/namei.c105
-rw-r--r--fs/qnx4/qnx4.h8
-rw-r--r--fs/qnx4/truncate.c34
-rw-r--r--fs/quota/Kconfig2
-rw-r--r--fs/quota/dquot.c99
-rw-r--r--fs/quota/quota.c93
-rw-r--r--fs/quota/quota_v1.c2
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/ramfs/file-nommu.c18
-rw-r--r--fs/ramfs/inode.c5
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/romfs/storage.c4
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/select.c15
-rw-r--r--fs/seq_file.c74
-rw-r--r--fs/smbfs/inode.c10
-rw-r--r--fs/smbfs/proc.c2
-rw-r--r--fs/splice.c30
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/super.c80
-rw-r--r--fs/sync.c95
-rw-r--r--fs/sysfs/bin.c4
-rw-r--r--fs/sysfs/dir.c8
-rw-r--r--fs/sysfs/file.c14
-rw-r--r--fs/sysfs/inode.c135
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h12
-rw-r--r--fs/ubifs/budget.c34
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c112
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/file.c77
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c29
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/key.h35
-rw-r--r--fs/ubifs/log.c17
-rw-r--r--fs/ubifs/lprops.c43
-rw-r--r--fs/ubifs/master.c20
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c4
-rw-r--r--fs/ubifs/replay.c6
-rw-r--r--fs/ubifs/scan.c32
-rw-r--r--fs/ubifs/super.c61
-rw-r--r--fs/ubifs/tnc.c76
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h13
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/udf/directory.c86
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c19
-rw-r--r--fs/udf/lowlevel.c4
-rw-r--r--fs/udf/namei.c1
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xattr_acl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c152
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c58
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c136
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c66
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c3
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c78
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c1
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_bmap.h11
-rw-r--r--fs/xfs/xfs_bmap_btree.c20
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c42
-rw-r--r--fs/xfs/xfs_btree.h15
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/xfs_fs.h2
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c806
-rw-r--r--fs/xfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/xfs_iget.c30
-rw-r--r--fs/xfs/xfs_inode.c12
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c28
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h1
-rw-r--r--fs/xfs/xfs_iomap.c9
-rw-r--r--fs/xfs/xfs_itable.c119
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c30
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_mru_cache.c29
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h14
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_trans.h4
-rw-r--r--fs/xfs/xfs_trans_ail.c23
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c86
-rw-r--r--fs/xfs/xfs_vnodeops.c97
-rw-r--r--fs/xfs/xfs_vnodeops.h1
582 files changed, 53309 insertions, 14806 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90bc..795233702a4e 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
See <http://v9fs.sf.net> for more information.
If unsure, say N.
+
+config 9P_FSCACHE
+ bool "Enable 9P client caching support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for 9p clients using FS-Cache
+
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e6..1a940ec7af61 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dir.o \
vfs_dentry.o \
v9fs.o \
- fid.o \
+ fid.o
+9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 000000000000..51c94e26a346
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN 11
+
+struct kmem_cache *vcookie_cache;
+
+struct fscache_netfs v9fs_cache_netfs = {
+ .name = "9p",
+ .version = 0,
+};
+
+static void init_once(void *foo)
+{
+ struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
+ vcookie->fscache = NULL;
+ vcookie->qid = NULL;
+ inode_init_once(&vcookie->inode);
+}
+
+/**
+ * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
+ * vcookie to inode mapping
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_init_vcookiecache(void)
+{
+ vcookie_cache = kmem_cache_create("vcookie_cache",
+ sizeof(struct v9fs_cookie),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (!vcookie_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * v9fs_destroy_vcookiecache - destroy the cache of vcookies
+ *
+ */
+
+static void v9fs_destroy_vcookiecache(void)
+{
+ kmem_cache_destroy(vcookie_cache);
+}
+
+int __v9fs_cache_register(void)
+{
+ int ret;
+ ret = v9fs_init_vcookiecache();
+ if (ret < 0)
+ return ret;
+
+ return fscache_register_netfs(&v9fs_cache_netfs);
+}
+
+void __v9fs_cache_unregister(void)
+{
+ v9fs_destroy_vcookiecache();
+ fscache_unregister_netfs(&v9fs_cache_netfs);
+}
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ * with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+ v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+ if (!v9ses->cachetag)
+ return -ENOMEM;
+
+ return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct v9fs_session_info *v9ses;
+ uint16_t klen = 0;
+
+ v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
+ buffer, bufmax);
+
+ if (v9ses->cachetag)
+ klen = strlen(v9ses->cachetag);
+
+ if (klen > bufmax)
+ return 0;
+
+ memcpy(buffer, v9ses->cachetag, klen);
+ P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+ return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+ .name = "9P.session",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+ /* If no cache session tag was specified, we generate a random one. */
+ if (!v9ses->cachetag)
+ v9fs_random_cachetag(v9ses);
+
+ v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+ &v9fs_cache_session_index_def,
+ v9ses);
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
+ v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
+ v9ses->fscache);
+ fscache_relinquish_cookie(v9ses->fscache, 0);
+ v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+ vcookie->qid->path);
+ return sizeof(vcookie->qid->path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ *size = i_size_read(&vcookie->inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+ *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+ vcookie->qid->version);
+ return sizeof(vcookie->qid->version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+
+ if (buflen != sizeof(vcookie->qid->version))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ if (memcmp(buffer, &vcookie->qid->version,
+ sizeof(vcookie->qid->version)))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+ struct v9fs_cookie *vcookie = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ for (;;) {
+ nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+ first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+ .name = "9p.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = v9fs_cache_inode_get_key,
+ .get_attr = v9fs_cache_inode_get_attr,
+ .get_aux = v9fs_cache_inode_get_aux,
+ .check_aux = v9fs_cache_inode_check_aux,
+ .now_uncached = v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie;
+ struct v9fs_session_info *v9ses;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ vcookie = v9fs_inode2cookie(inode);
+ if (vcookie->fscache)
+ return;
+
+ v9ses = v9fs_inode2v9ses(inode);
+ vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+ &v9fs_cache_inode_index_def,
+ vcookie);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
+ vcookie->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ if (!vcookie->fscache)
+ return;
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
+ vcookie->fscache);
+
+ fscache_relinquish_cookie(vcookie->fscache, 0);
+ vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ if (!vcookie->fscache)
+ return;
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
+ vcookie->fscache);
+
+ fscache_relinquish_cookie(vcookie->fscache, 1);
+ vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ struct p9_fid *fid;
+
+ if (!vcookie->fscache)
+ return;
+
+ spin_lock(&vcookie->lock);
+ fid = filp->private_data;
+ if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+ v9fs_cache_inode_flush_cookie(inode);
+ else
+ v9fs_cache_inode_get_cookie(inode);
+
+ spin_unlock(&vcookie->lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ struct v9fs_session_info *v9ses;
+ struct fscache_cookie *old;
+
+ if (!vcookie->fscache)
+ return;
+
+ old = vcookie->fscache;
+
+ spin_lock(&vcookie->lock);
+ fscache_relinquish_cookie(vcookie->fscache, 1);
+
+ v9ses = v9fs_inode2v9ses(inode);
+ vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+ &v9fs_cache_inode_index_def,
+ vcookie);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
+ inode, old, vcookie->fscache);
+
+ spin_unlock(&vcookie->lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ struct inode *inode = page->mapping->host;
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ BUG_ON(!vcookie->fscache);
+
+ if (PageFsCache(page)) {
+ if (fscache_check_page_write(vcookie->fscache, page)) {
+ if (!(gfp & __GFP_WAIT))
+ return 0;
+ fscache_wait_on_page_write(vcookie->fscache, page);
+ }
+
+ fscache_uncache_page(vcookie->fscache, page);
+ ClearPageFsCache(page);
+ }
+
+ return 1;
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ BUG_ON(!vcookie->fscache);
+
+ if (PageFsCache(page)) {
+ fscache_wait_on_page_write(vcookie->fscache, page);
+ BUG_ON(!PageLocked(page));
+ fscache_uncache_page(vcookie->fscache, page);
+ ClearPageFsCache(page);
+ }
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+ int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ if (!vcookie->fscache)
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_page(vcookie->fscache,
+ page,
+ v9fs_vfs_readpage_complete,
+ NULL,
+ GFP_KERNEL);
+ switch (ret) {
+ case -ENOBUFS:
+ case -ENODATA:
+ P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+ return 1;
+ case 0:
+ P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ return ret;
+ default:
+ P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ return ret;
+ }
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+ if (!vcookie->fscache)
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_pages(vcookie->fscache,
+ mapping, pages, nr_pages,
+ v9fs_vfs_readpage_complete,
+ NULL,
+ mapping_gfp_mask(mapping));
+ switch (ret) {
+ case -ENOBUFS:
+ case -ENODATA:
+ P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+ return 1;
+ case 0:
+ BUG_ON(!list_empty(pages));
+ BUG_ON(*nr_pages != 0);
+ P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ return ret;
+ default:
+ P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ return ret;
+ }
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+ P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+ if (ret != 0)
+ v9fs_uncache_page(inode, page);
+}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 000000000000..a94192bfaee8
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct kmem_cache *vcookie_cache;
+
+struct v9fs_cookie {
+ spinlock_t lock;
+ struct inode inode;
+ struct fscache_cookie *fscache;
+ struct p9_qid *qid;
+};
+
+static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
+{
+ return container_of(inode, struct v9fs_cookie, inode);
+}
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+
+
+/**
+ * v9fs_cache_register - Register v9fs file system with the cache
+ */
+static inline int v9fs_cache_register(void)
+{
+ return __v9fs_cache_register();
+}
+
+/**
+ * v9fs_cache_unregister - Unregister v9fs from the cache
+ */
+static inline void v9fs_cache_unregister(void)
+{
+ __v9fs_cache_unregister();
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+ gfp_t gfp)
+{
+ return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+ __v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return __v9fs_readpages_from_fscache(inode, mapping, pages,
+ nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ __v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ fscache_uncache_page(vcookie->fscache, page);
+ BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+ struct p9_qid *qid)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ spin_lock(&vcookie->lock);
+ vcookie->qid = qid;
+ spin_unlock(&vcookie->lock);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline int v9fs_cache_register(void)
+{
+ return 1;
+}
+
+static inline void v9fs_cache_unregister(void) {}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+ gfp_t gfp) {
+ return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+ struct p9_qid *qid)
+{}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f7003cfac63d..cf62b05e296a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
#include <net/9p/transport.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
/*
- * Option Parsing (code inspired by NFS code)
- * NOTE: each transport will parse its own options
- */
+ * Option Parsing (code inspired by NFS code)
+ * NOTE: each transport will parse its own options
+ */
enum {
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
- Opt_uname, Opt_remotename, Opt_trans,
+ Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
- Opt_cache_loose,
+ Opt_cache_loose, Opt_fscache,
/* Access options */
Opt_access,
/* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
{Opt_uname, "uname=%s"},
{Opt_remotename, "aname=%s"},
{Opt_nodevmap, "nodevmap"},
- {Opt_cache_loose, "cache=loose"},
+ {Opt_cache, "cache=%s"},
{Opt_cache_loose, "loose"},
+ {Opt_fscache, "fscache"},
+ {Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_err, NULL}
};
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->afid = ~0;
v9ses->debug = 0;
v9ses->cache = 0;
+#ifdef CONFIG_9P_FSCACHE
+ v9ses->cachetag = NULL;
+#endif
if (!opts)
return 0;
options = kstrdup(opts, GFP_KERNEL);
- if (!options) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- return -ENOMEM;
- }
+ if (!options)
+ goto fail_option_alloc;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_cache_loose:
v9ses->cache = CACHE_LOOSE;
break;
+ case Opt_fscache:
+ v9ses->cache = CACHE_FSCACHE;
+ break;
+ case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+ v9ses->cachetag = match_strdup(&args[0]);
+#endif
+ break;
+ case Opt_cache:
+ s = match_strdup(&args[0]);
+ if (!s)
+ goto fail_option_alloc;
+
+ if (strcmp(s, "loose") == 0)
+ v9ses->cache = CACHE_LOOSE;
+ else if (strcmp(s, "fscache") == 0)
+ v9ses->cache = CACHE_FSCACHE;
+ else
+ v9ses->cache = CACHE_NONE;
+ kfree(s);
+ break;
case Opt_access:
s = match_strdup(&args[0]);
- if (!s) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy"
- " of option argument\n");
- ret = -ENOMEM;
- break;
- }
+ if (!s)
+ goto fail_option_alloc;
+
v9ses->flags &= ~V9FS_ACCESS_MASK;
if (strcmp(s, "user") == 0)
v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
}
kfree(options);
return ret;
+
+fail_option_alloc:
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "failed to allocate copy of option argument\n");
+ return -ENOMEM;
}
/**
@@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
return ERR_PTR(-ENOMEM);
}
+ spin_lock(&v9fs_sessionlist_lock);
+ list_add(&v9ses->slist, &v9fs_sessionlist);
+ spin_unlock(&v9fs_sessionlist_lock);
+
v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
strcpy(v9ses->uname, V9FS_DEFUSER);
strcpy(v9ses->aname, V9FS_DEFANAME);
@@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
else
fid->uid = ~0;
+#ifdef CONFIG_9P_FSCACHE
+ /* register the session for caching */
+ v9fs_cache_session_get_cookie(v9ses);
+#endif
+
return fid;
error:
@@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
v9ses->clnt = NULL;
}
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->fscache) {
+ v9fs_cache_session_put_cookie(v9ses);
+ kfree(v9ses->cachetag);
+ }
+#endif
__putname(v9ses->uname);
__putname(v9ses->aname);
+
+ spin_lock(&v9fs_sessionlist_lock);
+ list_del(&v9ses->slist);
+ spin_unlock(&v9fs_sessionlist_lock);
}
/**
@@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
extern int v9fs_error_init(void);
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
/**
- * v9fs_init - Initialize module
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+ struct v9fs_session_info *v9ses;
+
+ spin_lock(&v9fs_sessionlist_lock);
+ list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+ if (v9ses->cachetag) {
+ n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+ if (n < 0) {
+ count = n;
+ break;
+ }
+
+ count += n;
+ limit -= n;
+ }
+ }
+
+ spin_unlock(&v9fs_sessionlist_lock);
+ return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+ &v9fs_attr_cache.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+ .attrs = v9fs_attrs,
+};
+
+/**
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int v9fs_sysfs_init(void)
+{
+ v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+ if (!v9fs_kobj)
+ return -ENOMEM;
+
+ if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+ kobject_put(v9fs_kobj);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+ sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+ kobject_put(v9fs_kobj);
+}
+
+/**
+ * init_v9fs - Initialize module
*
*/
static int __init init_v9fs(void)
{
+ int err;
printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
- return register_filesystem(&v9fs_fs_type);
+ err = register_filesystem(&v9fs_fs_type);
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register filesystem\n");
+ return err;
+ }
+
+ err = v9fs_cache_register();
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register v9fs for caching\n");
+ goto out_fs_unreg;
+ }
+
+ err = v9fs_sysfs_init();
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register with sysfs\n");
+ goto out_sysfs_cleanup;
+ }
+
+ return 0;
+
+out_sysfs_cleanup:
+ v9fs_sysfs_cleanup();
+
+out_fs_unreg:
+ unregister_filesystem(&v9fs_fs_type);
+
+ return err;
}
/**
- * v9fs_init - shutdown module
+ * exit_v9fs - shutdown module
*
*/
static void __exit exit_v9fs(void)
{
+ v9fs_sysfs_cleanup();
+ v9fs_cache_unregister();
unregister_filesystem(&v9fs_fs_type);
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 38762bf102a9..019f4ccb70c1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
enum p9_cache_modes {
CACHE_NONE,
CACHE_LOOSE,
+ CACHE_FSCACHE,
};
/**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
* @debug: debug level
* @afid: authentication handle
* @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
* @options: copy of options string given by user
* @uname: string user name to mount hierarchy as
* @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
* @dfltgid: default numeric groupid to mount hierarchy as
* @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
* @clnt: reference to 9P network client instantiated for this session
- * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ * @slist: reference to list of registered 9p sessions
*
* This structure holds state for each session instance established during
* a sys_mount() .
@@ -84,6 +87,10 @@ struct v9fs_session_info {
unsigned short debug;
unsigned int afid;
unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+ char *cachetag;
+ struct fscache_cookie *fscache;
+#endif
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
@@ -92,11 +99,9 @@ struct v9fs_session_info {
unsigned int dfltgid; /* default gid for legacy support */
u32 uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
- struct dentry *debugfs_dir;
+ struct list_head slist; /* list of sessions registered with v9fs */
};
-extern struct dentry *v9fs_debugfs_root;
-
struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
char *);
void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e205..3a7560e35865 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
extern const struct dentry_operations v9fs_dentry_operations;
extern const struct dentry_operations v9fs_cached_dentry_operations;
+#ifdef CONFIG_9P_FSCACHE
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+#endif
+
struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+void v9fs_clear_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30b..90e38449f4b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "cache.h"
/**
* v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
int retval;
loff_t offset;
char *buffer;
+ struct inode *inode;
+ inode = page->mapping->host;
P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+ BUG_ON(!PageLocked(page));
+
+ retval = v9fs_readpage_from_fscache(inode, page);
+ if (retval == 0)
+ return retval;
+
buffer = kmap(page);
offset = page_offset(page);
retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
- if (retval < 0)
+ if (retval < 0) {
+ v9fs_uncache_page(inode, page);
goto done;
+ }
memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
flush_dcache_page(page);
SetPageUptodate(page);
+
+ v9fs_readpage_to_fscache(inode, page);
retval = 0;
done:
@@ -72,6 +86,78 @@ done:
return retval;
}
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ int ret = 0;
+ struct inode *inode;
+
+ inode = mapping->host;
+ P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+ ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+ if (ret == 0)
+ return ret;
+
+ ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+ P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+ return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+ if (PagePrivate(page))
+ return 0;
+
+ return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+{
+ if (offset == 0)
+ v9fs_fscache_invalidate_page(page);
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Since the writes go directly to the server, we simply return a 0
+ * here to indicate success.
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+ return 0;
+}
+
const struct address_space_operations v9fs_addr_operations = {
.readpage = v9fs_vfs_readpage,
+ .readpages = v9fs_vfs_readpages,
+ .releasepage = v9fs_release_page,
+ .invalidatepage = v9fs_invalidate_page,
+ .launder_page = v9fs_launder_page,
};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 873cd31baa47..15cce53bf61e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -40,6 +40,24 @@
#include "fid.h"
/**
+ * struct p9_rdir - readdir accounting
+ * @mutex: mutex protecting readdir
+ * @head: start offset of current dirread buffer
+ * @tail: end offset of current dirread buffer
+ * @buf: dirread buffer
+ *
+ * private structure for keeping track of readdir
+ * allocated on demand
+ */
+
+struct p9_rdir {
+ struct mutex mutex;
+ int head;
+ int tail;
+ uint8_t *buf;
+};
+
+/**
* dt_type - return file type
* @mistat: mistat structure
*
@@ -70,56 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
int over;
struct p9_wstat st;
- int err;
+ int err = 0;
struct p9_fid *fid;
int buflen;
- char *statbuf;
- int n, i = 0;
+ int reclen = 0;
+ struct p9_rdir *rdir;
P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
- statbuf = kmalloc(buflen, GFP_KERNEL);
- if (!statbuf)
- return -ENOMEM;
-
- while (1) {
- err = v9fs_file_readn(filp, statbuf, NULL, buflen,
- fid->rdir_fpos);
- if (err <= 0)
- break;
-
- n = err;
- while (i < n) {
- err = p9stat_read(statbuf + i, buflen-i, &st,
- fid->clnt->dotu);
+
+ /* allocate rdir on demand */
+ if (!fid->rdir) {
+ rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+
+ if (rdir == NULL) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ spin_lock(&filp->f_dentry->d_lock);
+ if (!fid->rdir) {
+ rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
+ mutex_init(&rdir->mutex);
+ rdir->head = rdir->tail = 0;
+ fid->rdir = (void *) rdir;
+ rdir = NULL;
+ }
+ spin_unlock(&filp->f_dentry->d_lock);
+ kfree(rdir);
+ }
+ rdir = (struct p9_rdir *) fid->rdir;
+
+ err = mutex_lock_interruptible(&rdir->mutex);
+ while (err == 0) {
+ if (rdir->tail == rdir->head) {
+ err = v9fs_file_readn(filp, rdir->buf, NULL,
+ buflen, filp->f_pos);
+ if (err <= 0)
+ goto unlock_and_exit;
+
+ rdir->head = 0;
+ rdir->tail = err;
+ }
+
+ while (rdir->head < rdir->tail) {
+ err = p9stat_read(rdir->buf + rdir->head,
+ buflen - rdir->head, &st,
+ fid->clnt->dotu);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
p9stat_free(&st);
- goto free_and_exit;
+ goto unlock_and_exit;
}
-
- i += st.size+2;
- fid->rdir_fpos += st.size+2;
+ reclen = st.size+2;
over = filldir(dirent, st.name, strlen(st.name),
filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
- filp->f_pos += st.size+2;
-
p9stat_free(&st);
if (over) {
err = 0;
- goto free_and_exit;
+ goto unlock_and_exit;
}
+ rdir->head += reclen;
+ filp->f_pos += reclen;
}
}
-free_and_exit:
- kfree(statbuf);
+unlock_and_exit:
+ mutex_unlock(&rdir->mutex);
+exit:
return err;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c389..3902bf43a088 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/list.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <linux/idr.h>
#include <net/9p/9p.h>
@@ -40,6 +41,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
+#include "cache.h"
static const struct file_operations v9fs_cached_file_operations;
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
return err;
}
if (omode & P9_OTRUNC) {
- inode->i_size = 0;
+ i_size_write(inode, 0);
inode->i_blocks = 0;
}
if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
/* enable cached file options */
if(file->f_op == &v9fs_file_operations)
file->f_op = &v9fs_cached_file_operations;
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_cache_inode_set_cookie(inode, file);
+#endif
}
return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
struct p9_client *clnt;
struct inode *inode = filp->f_path.dentry->d_inode;
int origin = *offset;
+ unsigned long pg_start, pg_end;
P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
(int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
if (count < rsize)
rsize = count;
- n = p9_client_write(fid, NULL, data+total, *offset+total,
+ n = p9_client_write(fid, NULL, data+total, origin+total,
rsize);
if (n <= 0)
break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
} while (count > 0);
if (total > 0) {
- invalidate_inode_pages2_range(inode->i_mapping, origin,
- origin+total);
+ pg_start = origin >> PAGE_CACHE_SHIFT;
+ pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
+ if (inode->i_mapping && inode->i_mapping->nrpages)
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
*offset += total;
- }
-
- if (*offset > inode->i_size) {
- inode->i_size = *offset;
- inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+ i_size_write(inode, i_size_read(inode) + total);
+ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
}
if (n < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 06a223d50a81..18f74ec4dce9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
+#include "cache.h"
static const struct inode_operations v9fs_dir_inode_operations;
static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
wstat->extension = NULL;
}
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ * This callback is executed before setting up the inode so that we
+ * can associate a vcookie with each inode.
+ *
+ */
+
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+ struct v9fs_cookie *vcookie;
+ vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+ GFP_KERNEL);
+ if (!vcookie)
+ return NULL;
+
+ vcookie->fscache = NULL;
+ vcookie->qid = NULL;
+ spin_lock_init(&vcookie->lock);
+ return &vcookie->inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+}
+#endif
+
/**
* v9fs_get_inode - helper function to setup an inode
* @sb: superblock
@@ -326,6 +360,21 @@ error:
}
*/
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+void v9fs_clear_inode(struct inode *inode)
+{
+ filemap_fdatawrite(inode->i_mapping);
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_cache_inode_put_cookie(inode);
+#endif
+}
+
/**
* v9fs_inode_from_fid - populate an inode by issuing a attribute request
* @v9ses: session information
@@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
v9fs_stat2inode(st, ret, sb);
ret->i_ino = v9fs_qid2ino(&st->qid);
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_vcookie_set_qid(ret, &st->qid);
+ v9fs_cache_inode_get_cookie(ret);
+#endif
p9stat_free(st);
kfree(st);
+
return ret;
error:
@@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_inode2v9ses(dentry->d_inode);
- if (v9ses->cache == CACHE_LOOSE)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return simple_getattr(mnt, dentry, stat);
fid = v9fs_fid_lookup(dentry);
@@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
} else
inode->i_rdev = 0;
- inode->i_size = stat->length;
+ i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
}
/**
@@ -939,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
P9_DPRINTK(P9_DEBUG_VFS,
"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
- retval = buflen;
-
+ retval = strnlen(buffer, buflen);
done:
kfree(st);
return retval;
@@ -1007,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
__putname(link);
link = ERR_PTR(len);
} else
- link[len] = 0;
+ link[min(len, PATH_MAX-1)] = 0;
}
nd_set_link(nd, link);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8961f1a8f668..14a86448572c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
#include "v9fs_vfs.h"
#include "fid.h"
-static void v9fs_clear_inode(struct inode *);
static const struct super_operations v9fs_super_ops;
/**
- * v9fs_clear_inode - release an inode
- * @inode: inode to release
- *
- */
-
-static void v9fs_clear_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
-}
-
-/**
* v9fs_set_super - set the superblock
* @s: super block
* @data: file system specific data
@@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb)
}
static const struct super_operations v9fs_super_ops = {
+#ifdef CONFIG_9P_FSCACHE
+ .alloc_inode = v9fs_alloc_inode,
+ .destroy_inode = v9fs_destroy_inode,
+#endif
.statfs = simple_statfs,
.clear_inode = v9fs_clear_inode,
.show_options = generic_show_options,
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..0e0d65dccdbd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
endif # BLOCK
@@ -108,6 +109,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Virtual memory file system support (former shm fs)"
+ depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -133,8 +135,8 @@ config TMPFS_POSIX_ACL
config HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
- (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
+ depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
+ SYS_SUPPORTS_HUGETLBFS || BROKEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -186,7 +188,6 @@ source "fs/romfs/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
endif # MISC_FILESYSTEMS
@@ -233,6 +234,7 @@ config NFS_COMMON
source "net/sunrpc/Kconfig"
source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..5ef73a0a9f25 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -124,3 +124,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
int create)
{
- if (block < 0)
- goto abort_negative;
-
if (!create) {
if (block >= inode->i_blocks)
goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
/* don't support allocation of blocks yet */
return -EIO;
-abort_negative:
- adfs_error(inode->i_sb, "block %d < 0", block);
- return -EIO;
-
abort_toobig:
return 0;
}
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
static struct workqueue_struct *afs_lock_manager;
static DEFINE_MUTEX(afs_lock_manager_mutex);
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
.fl_copy_lock = afs_fl_copy_lock,
.fl_release_private = afs_fl_release_private,
};
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
+#include <linux/fscache.h>
#include "afs.h"
#include "afs_vl.h"
-#include "cache.h"
#define AFS_CELL_MAX_ADDRS 15
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 8630615e57fe..852739d262a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v);
static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
size_t size, loff_t *_pos);
-static struct seq_operations afs_proc_cells_ops = {
+static const struct seq_operations afs_proc_cells_ops = {
.start = afs_proc_cells_start,
.next = afs_proc_cells_next,
.stop = afs_proc_cells_stop,
@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
-static struct seq_operations afs_proc_cell_volumes_ops = {
+static const struct seq_operations afs_proc_cell_volumes_ops = {
.start = afs_proc_cell_volumes_start,
.next = afs_proc_cell_volumes_next,
.stop = afs_proc_cell_volumes_stop,
@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
-static struct seq_operations afs_proc_cell_vlservers_ops = {
+static const struct seq_operations afs_proc_cell_vlservers_ops = {
.start = afs_proc_cell_vlservers_start,
.next = afs_proc_cell_vlservers_next,
.stop = afs_proc_cell_vlservers_stop,
@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
-static struct seq_operations afs_proc_cell_servers_ops = {
+static const struct seq_operations afs_proc_cell_servers_ops = {
.start = afs_proc_cell_servers_start,
.next = afs_proc_cell_servers_next,
.stop = afs_proc_cell_servers_stop,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..5e15a21dbf9f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -671,7 +671,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
ssize_t result;
size_t count = iov_length(iov, nr_segs);
- int ret;
_enter("{%x.%u},{%zu},%lu,",
vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
@@ -691,13 +690,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
return result;
}
- /* return error values for O_SYNC and IS_SYNC() */
- if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
- ret = afs_fsync(iocb->ki_filp, dentry, 1);
- if (ret < 0)
- result = ret;
- }
-
_leave(" = %zd", result);
return result;
}
@@ -712,7 +704,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_writepages = 1,
.range_cyclic = 1,
};
int ret;
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..02a2c9340573 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -34,7 +35,6 @@
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
#if DEBUG > 1
#define dprintk printk
@@ -78,6 +78,7 @@ static int __init aio_setup(void)
return 0;
}
+__initcall(aio_setup);
static void aio_free_ring(struct kioctx *ctx)
{
@@ -380,6 +381,7 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
__set_current_state(TASK_RUNNING);
return iocb->ki_user_data;
}
+EXPORT_SYMBOL(wait_on_sync_kiocb);
/* exit_aio: called when the last user of mm goes away. At this point,
* there is no way for any new requests to be submited or any of the
@@ -573,6 +575,7 @@ int aio_put_req(struct kiocb *req)
spin_unlock_irq(&ctx->ctx_lock);
return ret;
}
+EXPORT_SYMBOL(aio_put_req);
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
@@ -595,51 +598,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
}
/*
- * use_mm
- * Makes the calling kernel thread take on the specified
- * mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void use_mm(struct mm_struct *mm)
-{
- struct mm_struct *active_mm;
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- active_mm = tsk->active_mm;
- atomic_inc(&mm->mm_count);
- tsk->mm = mm;
- tsk->active_mm = mm;
- switch_mm(active_mm, mm, tsk);
- task_unlock(tsk);
-
- mmdrop(active_mm);
-}
-
-/*
- * unuse_mm
- * Reverses the effect of use_mm, i.e. releases the
- * specified mm context which was earlier taken on
- * by the calling kernel thread
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void unuse_mm(struct mm_struct *mm)
-{
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- tsk->mm = NULL;
- /* active_mm is still 'mm' */
- enter_lazy_tlb(mm, tsk);
- task_unlock(tsk);
-}
-
-/*
* Queue up a kiocb to be retried. Assumes that the kiocb
* has already been marked as kicked, and places it on
* the retry run list for the corresponding ioctx, if it
@@ -1037,6 +995,7 @@ put_rq:
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
return ret;
}
+EXPORT_SYMBOL(aio_complete);
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
@@ -1825,9 +1784,3 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
return ret;
}
-
-__initcall(aio_setup);
-
-EXPORT_SYMBOL(aio_complete);
-EXPORT_SYMBOL(aio_put_req);
-EXPORT_SYMBOL(wait_on_sync_kiocb);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47d4a01c5393..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
*
*/
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/fs.h>
@@ -77,28 +79,24 @@ static const struct address_space_operations anon_aops = {
*
* Creates a new file by hooking it on a single inode. This is useful for files
* that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode,
+ * All the files created with anon_inode_getfile() will share a single inode,
* hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup. Returns new descriptor or -error.
+ * setup. Returns the newly created file* or an error pointer.
*/
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
- void *priv, int flags)
+struct file *anon_inode_getfile(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags)
{
struct qstr this;
struct dentry *dentry;
struct file *file;
- int error, fd;
+ int error;
if (IS_ERR(anon_inode_inode))
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
if (fops->owner && !try_module_get(fops->owner))
- return -ENOENT;
-
- error = get_unused_fd_flags(flags);
- if (error < 0)
- goto err_module;
- fd = error;
+ return ERR_PTR(-ENOENT);
/*
* Link the inode to a directory entry by creating a unique name
@@ -110,7 +108,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
this.hash = 0;
dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
if (!dentry)
- goto err_put_unused_fd;
+ goto err_module;
/*
* We know the anon_inode inode count is always greater than zero,
@@ -136,16 +134,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
file->f_version = 0;
file->private_data = priv;
+ return file;
+
+err_dput:
+ dput(dentry);
+err_module:
+ module_put(fops->owner);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ * anonymous inode, and a dentry that describe the "class"
+ * of the file
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup. Returns new descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+ void *priv, int flags)
+{
+ int error, fd;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = anon_inode_getfile(name, fops, priv, flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
fd_install(fd, file);
return fd;
-err_dput:
- dput(dentry);
err_put_unused_fd:
put_unused_fd(fd);
-err_module:
- module_put(fops->owner);
return error;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a8..96d394bdaddf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
/* Taken over from the old code... */
/* POSIX UID/GID verification for setting inode attributes. */
-int inode_change_ok(struct inode *inode, struct iattr *attr)
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
{
int retval = -EPERM;
unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
error:
return retval;
}
-
EXPORT_SYMBOL(inode_change_ok);
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode: the inode to be truncated
+ * @offset: the new size to assign to the inode
+ * @Returns: 0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to check that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+ if (inode->i_size < offset) {
+ unsigned long limit;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig;
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big;
+ } else {
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+ }
+
+ return 0;
+out_sig:
+ send_sig(SIGXFSZ, current, 0);
+out_big:
+ return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
int inode_setattr(struct inode * inode, struct iattr * attr)
{
unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 2316e944a109..e947915109e5 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
continue;
}
- while (d_mountpoint(path.dentry) && follow_down(&path));
+ while (d_mountpoint(path.dentry) && follow_down(&path))
;
umount_ok = may_umount(path.mnt);
path_put(&path);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..33baf27fac78 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
{
kfree(BEFS_SB(sb)->mount_opts.iocharset);
BEFS_SB(sb)->mount_opts.iocharset = NULL;
-
- if (BEFS_SB(sb)->nls) {
- unload_nls(BEFS_SB(sb)->nls);
- BEFS_SB(sb)->nls = NULL;
- }
-
+ unload_nls(BEFS_SB(sb)->nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
@@ -842,7 +837,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = BEFS_SUPER_MAGIC;
/* Set real blocksize of fs */
sb_set_blocksize(sb, (ulong) befs_sb->block_size);
- sb->s_op = (struct super_operations *) &befs_sops;
+ sb->s_op = &befs_sops;
root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
if (IS_ERR(root)) {
ret = PTR_ERR(root);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
}
}
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_close;
- }
+ if (last_bss > elf_bss) {
+ /*
+ * Now fill out the bss section. First pad the last page up
+ * to the page boundary, and then perform a mmap to make sure
+ * that there are zero-mapped pages up to and including the
+ * last bss page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out_close;
+ }
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+ /* What we have mapped so far */
+ elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
- /* Map the last of the bss segment */
- if (last_bss > elf_bss) {
+ /* Map the last of the bss segment */
down_write(&current->mm->mmap_sem);
error = do_brk(elf_bss, last_bss - elf_bss);
up_write(&current->mm->mmap_sem);
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static void fill_elf_header(struct elfhdr *elf, int segs,
u16 machine, u32 flags, u8 osabi)
@@ -1714,42 +1711,52 @@ struct elf_note_info {
int numnote;
};
-static int fill_note_info(struct elfhdr *elf, int phdrs,
- struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+static int elf_note_info_init(struct elf_note_info *info)
{
-#define NUM_NOTES 6
- struct list_head *t;
-
- info->notes = NULL;
- info->prstatus = NULL;
- info->psinfo = NULL;
- info->fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
- info->xfpu = NULL;
-#endif
+ memset(info, 0, sizeof(*info));
INIT_LIST_HEAD(&info->thread_list);
- info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
- GFP_KERNEL);
+ /* Allocate space for six ELF notes */
+ info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
if (!info->notes)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
if (!info->psinfo)
- return 0;
+ goto notes_free;
info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
if (!info->prstatus)
- return 0;
+ goto psinfo_free;
info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
if (!info->fpu)
- return 0;
+ goto prstatus_free;
#ifdef ELF_CORE_COPY_XFPREGS
info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
if (!info->xfpu)
- return 0;
+ goto fpu_free;
+#endif
+ return 1;
+#ifdef ELF_CORE_COPY_XFPREGS
+ fpu_free:
+ kfree(info->fpu);
#endif
+ prstatus_free:
+ kfree(info->prstatus);
+ psinfo_free:
+ kfree(info->psinfo);
+ notes_free:
+ kfree(info->notes);
+ return 0;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+ struct elf_note_info *info,
+ long signr, struct pt_regs *regs)
+{
+ struct list_head *t;
+
+ if (!elf_note_info_init(info))
+ return 0;
- info->thread_status_size = 0;
if (signr) {
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -1809,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
#endif
return 1;
-
-#undef NUM_NOTES
}
static size_t get_note_info_size(struct elf_note_info *info)
@@ -2016,7 +2021,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
goto end_coredump;
/* Align to page */
- DUMP_SEEK(dataoff - foffset);
+ if (!dump_seek(file, dataoff - foffset))
+ goto end_coredump;
for (vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2033,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
struct page *page;
- struct vm_area_struct *tmp_vma;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &tmp_vma) <= 0) {
- DUMP_SEEK(PAGE_SIZE);
- } else {
- if (page == ZERO_PAGE(0)) {
- if (!dump_seek(file, PAGE_SIZE)) {
- page_cache_release(page);
- goto end_coredump;
- }
- } else {
- void *kaddr;
- flush_cache_page(tmp_vma, addr,
- page_to_pfn(page));
- kaddr = kmap(page);
- if ((size += PAGE_SIZE) > limit ||
- !dump_write(file, kaddr,
- PAGE_SIZE)) {
- kunmap(page);
- page_cache_release(page);
- goto end_coredump;
- }
- kunmap(page);
- }
+ int stop;
+
+ page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ stop = ((size += PAGE_SIZE) > limit) ||
+ !dump_write(file, kaddr, PAGE_SIZE);
+ kunmap(page);
page_cache_release(page);
- }
+ } else
+ stop = !dump_seek(file, PAGE_SIZE);
+ if (stop)
+ goto end_coredump;
}
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
}
stack_size = exec_params.stack_size;
- if (stack_size < interp_params.stack_size)
- stack_size = interp_params.stack_size;
-
if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
executable_stack = EXSTACK_ENABLE_X;
else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
executable_stack = EXSTACK_DISABLE_X;
- else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
- executable_stack = EXSTACK_ENABLE_X;
- else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
- executable_stack = EXSTACK_DISABLE_X;
else
executable_stack = EXSTACK_DEFAULT;
+ if (stack_size == 0) {
+ stack_size = interp_params.stack_size;
+ if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+ executable_stack = EXSTACK_ENABLE_X;
+ else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+ executable_stack = EXSTACK_DISABLE_X;
+ else
+ executable_stack = EXSTACK_DEFAULT;
+ }
+
retval = -ENOEXEC;
if (stack_size == 0)
goto error;
@@ -1325,9 +1328,6 @@ static int writenote(struct memelfnote *men, struct file *file)
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1518,6 +1518,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
unsigned long *limit, unsigned long mm_flags)
{
struct vm_area_struct *vma;
+ int err = 0;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
unsigned long addr;
@@ -1525,43 +1526,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
if (!maydump(vma, mm_flags))
continue;
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE
- ) {
- struct vm_area_struct *vma;
- struct page *page;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &vma) <= 0) {
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else if (page == ZERO_PAGE(0)) {
- page_cache_release(page);
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else {
- void *kaddr;
-
- flush_cache_page(vma, addr, page_to_pfn(page));
- kaddr = kmap(page);
- if ((*size += PAGE_SIZE) > *limit ||
- !dump_write(file, kaddr, PAGE_SIZE)
- ) {
- kunmap(page);
- page_cache_release(page);
- return -EIO;
- }
+ for (addr = vma->vm_start; addr < vma->vm_end;
+ addr += PAGE_SIZE) {
+ struct page *page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ *size += PAGE_SIZE;
+ if (*size > *limit)
+ err = -EFBIG;
+ else if (!dump_write(file, kaddr, PAGE_SIZE))
+ err = -EIO;
kunmap(page);
page_cache_release(page);
- }
+ } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+ err = -EFBIG;
+ if (err)
+ goto out;
}
}
-
- return 0;
-
-end_coredump:
- return -EFBIG;
+out:
+ return err;
}
#endif
@@ -1802,7 +1786,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
goto end_coredump;
}
- DUMP_SEEK(dataoff);
+ if (!dump_seek(file, dataoff))
+ goto end_coredump;
if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
goto end_coredump;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
if (ret <= 0)
break;
- if (ret >= (unsigned long) -4096)
- break;
len -= ret;
strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
"(%d != %d)", (unsigned) r, curid, id);
goto failed;
} else if ( ! p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) > (unsigned long) -4096) {
+ IS_ERR_VALUE(load_flat_shared_library(id, p))) {
printk("BINFMT_FLAT: failed to load library %d", id);
goto failed;
}
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_EXECUTABLE, 0);
up_write(&current->mm->mmap_sem);
- if (!textpos || textpos >= (unsigned long) -4096) {
+ if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
textpos = (unsigned long) -ENOMEM;
printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
up_write(&current->mm->mmap_sem);
- if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
+ if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
if (!realdatastart)
realdatastart = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
result = bprm->file->f_op->read(bprm->file, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), &fpos);
}
- if (result >= (unsigned long)-4096) {
+ if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
do_munmap(current->mm, textpos, text_len);
do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
up_write(&current->mm->mmap_sem);
- if (!textpos || textpos >= (unsigned long) -4096) {
+ if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
textpos = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
fpos = 0;
result = bprm->file->f_op->read(bprm->file,
(char *) textpos, text_len, &fpos);
- if (result < (unsigned long) -4096)
+ if (!IS_ERR_VALUE(result))
result = decompress_exec(bprm, text_len, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), 0);
}
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
fpos = 0;
result = bprm->file->f_op->read(bprm->file,
(char *) textpos, text_len, &fpos);
- if (result < (unsigned long) -4096) {
+ if (!IS_ERR_VALUE(result)) {
fpos = ntohl(hdr->data_start);
result = bprm->file->f_op->read(bprm->file, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), &fpos);
}
}
- if (result >= (unsigned long)-4096) {
+ if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
do_munmap(current->mm, textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
res = prepare_binprm(&bprm);
- if (res <= (unsigned long)-4096)
+ if (!IS_ERR_VALUE(res))
res = load_flat_file(&bprm, libs, id, NULL);
abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
- if (res > (unsigned long)-4096)
+ if (IS_ERR_VALUE(res))
return res;
/* Update data segment pointers for all libraries */
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e8..12da5db8682c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
mempool_free(p, bs->bio_pool);
}
+EXPORT_SYMBOL(bio_free);
void bio_init(struct bio *bio)
{
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1);
}
+EXPORT_SYMBOL(bio_init);
/**
* bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
mempool_free(p, bs->bio_pool);
return NULL;
}
+EXPORT_SYMBOL(bio_alloc_bioset);
static void bio_fs_destructor(struct bio *bio)
{
@@ -322,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio)
* @gfp_mask: allocation mask to use
* @nr_iovecs: number of iovecs
*
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask
- * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ * bio_alloc will allocate a bio and associated bio_vec array that can hold
+ * at least @nr_iovecs entries. Allocations will be done from the
+ * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
+ *
+ * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ * a bio. This is due to the mempool guarantees. To make this work, callers
+ * must never allocate more than 1 bio at a time from this pool. Callers
+ * that need to allocate more than 1 bio must always submit the previously
+ * allocated bio for IO before attempting to allocate a new one. Failure to
+ * do so can cause livelocks under memory pressure.
*
* RETURNS:
* Pointer to new bio on success, NULL on failure.
@@ -337,6 +348,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
return bio;
}
+EXPORT_SYMBOL(bio_alloc);
static void bio_kmalloc_destructor(struct bio *bio)
{
@@ -346,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio)
}
/**
- * bio_alloc - allocate a bio for I/O
+ * bio_kmalloc - allocate a bio for I/O using kmalloc()
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
*
* Description:
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
+ * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
+ * %__GFP_WAIT, the allocation is guaranteed to succeed.
*
**/
struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
return bio;
}
+EXPORT_SYMBOL(bio_kmalloc);
void zero_fill_bio(struct bio *bio)
{
@@ -402,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio);
*
* Description:
* Put a reference to a &struct bio, either one you have gotten with
- * bio_alloc or bio_get. The last put of a bio will free it.
+ * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
**/
void bio_put(struct bio *bio)
{
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
bio->bi_destructor(bio);
}
}
+EXPORT_SYMBOL(bio_put);
inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
return bio->bi_phys_segments;
}
+EXPORT_SYMBOL(bio_phys_segments);
/**
* __bio_clone - clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
bio->bi_size = bio_src->bi_size;
bio->bi_idx = bio_src->bi_idx;
}
+EXPORT_SYMBOL(__bio_clone);
/**
* bio_clone - clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
+EXPORT_SYMBOL(bio_clone);
/**
* bio_get_nr_vecs - return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
return nr_pages;
}
+EXPORT_SYMBOL(bio_get_nr_vecs);
static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
*page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
return __bio_add_page(q, bio, page, len, offset,
queue_max_hw_sectors(q));
}
+EXPORT_SYMBOL(bio_add_pc_page);
/**
* bio_add_page - attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
}
+EXPORT_SYMBOL(bio_add_page);
struct bio_map_data {
struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL(bio_uncopy_user);
/**
* bio_copy_user_iov - copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
+EXPORT_SYMBOL(bio_copy_user);
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
+EXPORT_SYMBOL(bio_map_user);
/**
* bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
__bio_unmap_user(bio);
bio_put(bio);
}
+EXPORT_SYMBOL(bio_unmap_user);
static void bio_map_kern_endio(struct bio *bio, int err)
{
bio_put(bio);
}
-
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
bio_put(bio);
return ERR_PTR(-EINVAL);
}
+EXPORT_SYMBOL(bio_map_kern);
static void bio_copy_kern_endio(struct bio *bio, int err)
{
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
return bio;
}
+EXPORT_SYMBOL(bio_copy_kern);
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
if (bio->bi_end_io)
bio->bi_end_io(bio, error);
}
+EXPORT_SYMBOL(bio_endio);
void bio_pair_release(struct bio_pair *bp)
{
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
mempool_free(bp, bp->bio2.bi_private);
}
}
+EXPORT_SYMBOL(bio_pair_release);
static void bio_pair_end_1(struct bio *bi, int err)
{
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
return bp;
}
+EXPORT_SYMBOL(bio_split);
/**
* bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
kfree(bs);
}
+EXPORT_SYMBOL(bioset_free);
/**
* bioset_create - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
bioset_free(bs);
return NULL;
}
+EXPORT_SYMBOL(bioset_create);
static void __init biovec_init_slabs(void)
{
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
return 0;
}
-
subsys_initcall(init_bio);
-
-EXPORT_SYMBOL(bio_alloc);
-EXPORT_SYMBOL(bio_kmalloc);
-EXPORT_SYMBOL(bio_put);
-EXPORT_SYMBOL(bio_free);
-EXPORT_SYMBOL(bio_endio);
-EXPORT_SYMBOL(bio_init);
-EXPORT_SYMBOL(__bio_clone);
-EXPORT_SYMBOL(bio_clone);
-EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_add_page);
-EXPORT_SYMBOL(bio_add_pc_page);
-EXPORT_SYMBOL(bio_get_nr_vecs);
-EXPORT_SYMBOL(bio_map_user);
-EXPORT_SYMBOL(bio_unmap_user);
-EXPORT_SYMBOL(bio_map_kern);
-EXPORT_SYMBOL(bio_copy_kern);
-EXPORT_SYMBOL(bio_pair_release);
-EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_copy_user);
-EXPORT_SYMBOL(bio_uncopy_user);
-EXPORT_SYMBOL(bioset_create);
-EXPORT_SYMBOL(bioset_free);
-EXPORT_SYMBOL(bio_alloc_bioset);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..8bed0557d88c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
* freeze_bdev -- lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- bdev->bd_fsfreeze_count++;
+ if (++bdev->bd_fsfreeze_count > 1) {
+ /*
+ * We don't even need to grab a reference - the first call
+ * to freeze_bdev grab an active reference and only the last
+ * thaw_bdev drops it.
+ */
sb = get_super(bdev);
+ drop_super(sb);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
- bdev->bd_fsfreeze_count++;
-
- down(&bdev->bd_mount_sem);
- sb = get_super(bdev);
- if (sb && !(sb->s_flags & MS_RDONLY)) {
- sb->s_frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
- sync_filesystem(sb);
-
- sb->s_frozen = SB_FREEZE_TRANS;
- smp_wmb();
-
- sync_blockdev(sb->s_bdev);
-
- if (sb->s_op->freeze_fs) {
- error = sb->s_op->freeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem freeze failed\n");
- sb->s_frozen = SB_UNFROZEN;
- drop_super(sb);
- up(&bdev->bd_mount_sem);
- bdev->bd_fsfreeze_count--;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return ERR_PTR(error);
- }
+
+ sb = get_active_super(bdev);
+ if (!sb)
+ goto out;
+ if (sb->s_flags & MS_RDONLY) {
+ deactivate_locked_super(sb);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return sb;
+ }
+
+ sb->s_frozen = SB_FREEZE_WRITE;
+ smp_wmb();
+
+ sync_filesystem(sb);
+
+ sb->s_frozen = SB_FREEZE_TRANS;
+ smp_wmb();
+
+ sync_blockdev(sb->s_bdev);
+
+ if (sb->s_op->freeze_fs) {
+ error = sb->s_op->freeze_fs(sb);
+ if (error) {
+ printk(KERN_ERR
+ "VFS:Filesystem freeze failed\n");
+ sb->s_frozen = SB_UNFROZEN;
+ deactivate_locked_super(sb);
+ bdev->bd_fsfreeze_count--;
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return ERR_PTR(error);
}
}
+ up_write(&sb->s_umount);
+ out:
sync_blockdev(bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
- return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
+ return sb; /* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
*/
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
- int error = 0;
+ int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return -EINVAL;
- }
-
- bdev->bd_fsfreeze_count--;
- if (bdev->bd_fsfreeze_count > 0) {
- if (sb)
- drop_super(sb);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return 0;
- }
-
- if (sb) {
- BUG_ON(sb->s_bdev != bdev);
- if (!(sb->s_flags & MS_RDONLY)) {
- if (sb->s_op->unfreeze_fs) {
- error = sb->s_op->unfreeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem thaw failed\n");
- sb->s_frozen = SB_FREEZE_TRANS;
- bdev->bd_fsfreeze_count++;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
- }
- }
- sb->s_frozen = SB_UNFROZEN;
- smp_wmb();
- wake_up(&sb->s_wait_unfrozen);
+ if (!bdev->bd_fsfreeze_count)
+ goto out_unlock;
+
+ error = 0;
+ if (--bdev->bd_fsfreeze_count > 0)
+ goto out_unlock;
+
+ if (!sb)
+ goto out_unlock;
+
+ BUG_ON(sb->s_bdev != bdev);
+ down_write(&sb->s_umount);
+ if (sb->s_flags & MS_RDONLY)
+ goto out_deactivate;
+
+ if (sb->s_op->unfreeze_fs) {
+ error = sb->s_op->unfreeze_fs(sb);
+ if (error) {
+ printk(KERN_ERR
+ "VFS:Filesystem thaw failed\n");
+ sb->s_frozen = SB_FREEZE_TRANS;
+ bdev->bd_fsfreeze_count++;
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return error;
}
- drop_super(sb);
}
- up(&bdev->bd_mount_sem);
+ sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
+
+out_deactivate:
+ if (sb)
+ deactivate_locked_super(sb);
+out_unlock:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return 0;
}
@@ -420,7 +427,6 @@ static void bdev_destroy_inode(struct inode *inode)
{
struct bdev_inode *bdi = BDEV_I(inode);
- bdi->bdev.bd_inode_backing_dev_info = NULL;
kmem_cache_free(bdev_cachep, bdi);
}
@@ -431,7 +437,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
@@ -1115,7 +1120,7 @@ EXPORT_SYMBOL(revalidate_disk);
int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
- struct block_device_operations * bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
if (!bdops->media_changed)
return 0;
@@ -1243,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {
- put_disk(disk);
module_put(disk->fops->owner);
+ put_disk(disk);
disk = NULL;
if (bdev->bd_contains == bdev) {
if (bdev->bd_disk->fops->open) {
@@ -1405,6 +1410,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
}
/*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
* Try to release a page associated with block device when the system
* is under memory pressure.
*/
@@ -1436,7 +1468,7 @@ const struct file_operations def_blk_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write_nolock,
+ .aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = block_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..361604244271 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
.set = btrfs_xattr_acl_access_set,
};
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
int btrfs_acl_chmod(struct inode *inode)
{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
return 0;
}
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449ab..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
/* number of things on the pending list */
atomic_t num_pending;
+ /* reference counter for this struct */
+ atomic_t refs;
+
unsigned long sequence;
/* protects the pending list. */
@@ -61,6 +64,51 @@ struct btrfs_worker_thread {
};
/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time. It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations. This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+ struct btrfs_work work;
+ struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+ struct worker_start *start;
+ start = container_of(work, struct worker_start, work);
+ btrfs_start_workers(start->queue, 1);
+ kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+ struct worker_start *start;
+ int ret;
+
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return -ENOMEM;
+
+ start->work.func = start_new_worker_func;
+ start->queue = queue;
+ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+ if (ret)
+ kfree(start);
+ return ret;
+}
+
+/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
*/
@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 1;
- list_move(&worker->worker_list, &worker->workers->idle_list);
+
+ /* the list may be empty if the worker is just starting */
+ if (!list_empty(&worker->worker_list)) {
+ list_move(&worker->worker_list,
+ &worker->workers->idle_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
+
+ if (!list_empty(&worker->worker_list)) {
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
+ struct btrfs_workers *workers = worker->workers;
unsigned long flags;
+ rmb();
+ if (!workers->atomic_start_pending)
+ return;
+
+ spin_lock_irqsave(&workers->lock, flags);
+ if (!workers->atomic_start_pending)
+ goto out;
+
+ workers->atomic_start_pending = 0;
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers)
+ goto out;
+
+ workers->num_workers_starting += 1;
+ spin_unlock_irqrestore(&workers->lock, flags);
+ start_new_worker(workers);
+ return;
+
+out:
+ spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ struct btrfs_work *work)
+{
if (!workers->ordered)
return 0;
set_bit(WORK_DONE_BIT, &work->flags);
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
while (1) {
if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
work->ordered_func(work);
/* now take the lock again and call the freeing code */
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
list_del(&work->order_list);
work->ordered_free(work);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
return 0;
}
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+ if (atomic_dec_and_test(&worker->refs))
+ kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+ int freeit = 0;
+
+ spin_lock_irq(&worker->lock);
+ spin_lock(&worker->workers->lock);
+ if (worker->workers->num_workers > 1 &&
+ worker->idle &&
+ !worker->working &&
+ !list_empty(&worker->worker_list) &&
+ list_empty(&worker->prio_pending) &&
+ list_empty(&worker->pending) &&
+ atomic_read(&worker->num_pending) == 0) {
+ freeit = 1;
+ list_del_init(&worker->worker_list);
+ worker->workers->num_workers--;
+ }
+ spin_unlock(&worker->workers->lock);
+ spin_unlock_irq(&worker->lock);
+
+ if (freeit)
+ put_worker(worker);
+ return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+ struct list_head *prio_head,
+ struct list_head *head)
+{
+ struct btrfs_work *work = NULL;
+ struct list_head *cur = NULL;
+
+ if(!list_empty(prio_head))
+ cur = prio_head->next;
+
+ smp_mb();
+ if (!list_empty(&worker->prio_pending))
+ goto refill;
+
+ if (!list_empty(head))
+ cur = head->next;
+
+ if (cur)
+ goto out;
+
+refill:
+ spin_lock_irq(&worker->lock);
+ list_splice_tail_init(&worker->prio_pending, prio_head);
+ list_splice_tail_init(&worker->pending, head);
+
+ if (!list_empty(prio_head))
+ cur = prio_head->next;
+ else if (!list_empty(head))
+ cur = head->next;
+ spin_unlock_irq(&worker->lock);
+
+ if (!cur)
+ goto out_fail;
+
+out:
+ work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+ return work;
+}
+
/*
* main loop for servicing work items
*/
static int worker_loop(void *arg)
{
struct btrfs_worker_thread *worker = arg;
- struct list_head *cur;
+ struct list_head head;
+ struct list_head prio_head;
struct btrfs_work *work;
+
+ INIT_LIST_HEAD(&head);
+ INIT_LIST_HEAD(&prio_head);
+
do {
- spin_lock_irq(&worker->lock);
-again_locked:
+again:
while (1) {
- if (!list_empty(&worker->prio_pending))
- cur = worker->prio_pending.next;
- else if (!list_empty(&worker->pending))
- cur = worker->pending.next;
- else
+
+
+ work = get_next_work(worker, &prio_head, &head);
+ if (!work)
break;
- work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags);
work->worker = worker;
- spin_unlock_irq(&worker->lock);
work->func(work);
@@ -175,9 +329,13 @@ again_locked:
*/
run_ordered_completions(worker->workers, work);
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
+ check_pending_worker_creates(worker);
+
}
+
+ spin_lock_irq(&worker->lock);
+ check_idle_worker(worker);
+
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
@@ -216,8 +374,10 @@ again_locked:
spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- goto again_locked;
+ !list_empty(&worker->prio_pending)) {
+ spin_unlock_irq(&worker->lock);
+ goto again;
+ }
/*
* this makes sure we get a wakeup when someone
@@ -226,8 +386,13 @@ again_locked:
worker->working = 0;
spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop())
- schedule();
+ if (!kthread_should_stop()) {
+ schedule_timeout(HZ * 120);
+ if (!worker->working &&
+ try_worker_shutdown(worker)) {
+ return 0;
+ }
+ }
}
__set_current_state(TASK_RUNNING);
}
@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
+ int can_stop;
+ spin_lock_irq(&workers->lock);
list_splice_init(&workers->idle_list, &workers->worker_list);
while (!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
worker = list_entry(cur, struct btrfs_worker_thread,
worker_list);
- kthread_stop(worker->task);
- list_del(&worker->worker_list);
- kfree(worker);
+
+ atomic_inc(&worker->refs);
+ workers->num_workers -= 1;
+ if (!list_empty(&worker->worker_list)) {
+ list_del_init(&worker->worker_list);
+ put_worker(worker);
+ can_stop = 1;
+ } else
+ can_stop = 0;
+ spin_unlock_irq(&workers->lock);
+ if (can_stop)
+ kthread_stop(worker->task);
+ spin_lock_irq(&workers->lock);
+ put_worker(worker);
}
+ spin_unlock_irq(&workers->lock);
return 0;
}
/*
* simple init on struct btrfs_workers
*/
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_helper)
{
workers->num_workers = 0;
+ workers->num_workers_starting = 0;
INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock);
+ spin_lock_init(&workers->order_lock);
workers->max_workers = max;
workers->idle_thresh = 32;
workers->name = name;
workers->ordered = 0;
+ workers->atomic_start_pending = 0;
+ workers->atomic_worker_start = async_helper;
}
/*
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+ int num_workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
+
atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
kfree(worker);
goto fail;
}
-
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irq(&workers->lock);
}
return 0;
@@ -316,6 +504,14 @@ fail:
return ret;
}
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting += num_workers;
+ spin_unlock_irq(&workers->lock);
+ return __btrfs_start_workers(workers, num_workers);
+}
+
/*
* run through the list and find a worker thread that doesn't have a lot
* to do right now. This can return null if we aren't yet at the thread
@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
struct list_head *next;
- int enforce_min = workers->num_workers < workers->max_workers;
+ int enforce_min;
+
+ enforce_min = (workers->num_workers + workers->num_workers_starting) <
+ workers->max_workers;
/*
* if we find an idle thread, don't move it to the end of the
@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
*/
next = workers->worker_list.next;
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- atomic_inc(&worker->num_pending);
worker->sequence++;
if (worker->sequence % workers->idle_thresh == 0)
@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
+ struct list_head *fallback;
again:
spin_lock_irqsave(&workers->lock, flags);
worker = next_worker(workers);
- spin_unlock_irqrestore(&workers->lock, flags);
if (!worker) {
- spin_lock_irqsave(&workers->lock, flags);
- if (workers->num_workers >= workers->max_workers) {
- struct list_head *fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the force one
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
- spin_unlock_irqrestore(&workers->lock, flags);
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers) {
+ goto fallback;
+ } else if (workers->atomic_worker_start) {
+ workers->atomic_start_pending = 1;
+ goto fallback;
} else {
+ workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
- btrfs_start_workers(workers, 1);
+ __btrfs_start_workers(workers, 1);
goto again;
}
}
+ goto found;
+
+fallback:
+ fallback = NULL;
+ /*
+ * we have failed to find any workers, just
+ * return the first one we can find.
+ */
+ if (!list_empty(&workers->worker_list))
+ fallback = workers->worker_list.next;
+ if (!list_empty(&workers->idle_list))
+ fallback = workers->idle_list.next;
+ BUG_ON(!fallback);
+ worker = list_entry(fallback,
+ struct btrfs_worker_thread, worker_list);
+found:
+ /*
+ * this makes sure the worker doesn't exit before it is placed
+ * onto a busy/idle list
+ */
+ atomic_inc(&worker->num_pending);
+ spin_unlock_irqrestore(&workers->lock, flags);
return worker;
}
@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
spin_lock(&worker->workers->lock);
worker->idle = 0;
list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
+ &worker->workers->worker_list);
spin_unlock(&worker->workers->lock);
}
if (!worker->working) {
@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
worker->working = 1;
}
- spin_unlock_irqrestore(&worker->lock, flags);
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers);
if (workers->ordered) {
- spin_lock_irqsave(&workers->lock, flags);
+ /*
+ * you're not allowed to do ordered queues from an
+ * interrupt handler
+ */
+ spin_lock(&workers->order_lock);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
} else {
INIT_LIST_HEAD(&work->order_list);
}
@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
check_busy_worker(worker);
/*
@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
wake = 1;
worker->working = 1;
- spin_unlock_irqrestore(&worker->lock, flags);
-
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
+
out:
return 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
/* current number of running workers */
int num_workers;
+ int num_workers_starting;
+
/* max number of workers allowed. changed by btrfs_start_workers */
int max_workers;
@@ -73,6 +75,16 @@ struct btrfs_workers {
/* force completions in the order they were queued */
int ordered;
+ /* more workers required, but in an interrupt handler */
+ int atomic_start_pending;
+
+ /*
+ * are we allowed to sleep while starting workers or are we required
+ * to start them at a later time? If we can't sleep, this indicates
+ * which queue we need to use to schedule thread creation.
+ */
+ struct btrfs_workers *atomic_worker_start;
+
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
* idle thresh limit above.
@@ -90,6 +102,9 @@ struct btrfs_workers {
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
+ /* lock for the ordered lists */
+ spinlock_t order_lock;
+
/* extra name for this worker, used for current->name */
char *name;
};
@@ -97,7 +112,8 @@ struct btrfs_workers {
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_starter);
int btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e..f6783a42f010 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ u64 last_sub_trans;
+
/*
* transid that last logged this inode
*/
@@ -128,6 +134,16 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
+ */
+ spinlock_t accounting_lock;
+ int reserved_extents;
+ int outstanding_extents;
+
+ /*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -138,6 +154,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned dummy_inode:1;
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3a..ec96f3a6d536 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int split;
int num_doubles = 0;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ return -EOVERFLOW;
+
/* first try to make some room by pushing left and right */
if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
wret = push_leaf_right(trans, root, path, data_size, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84ca..444b3e9b92a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
@@ -670,21 +674,29 @@ struct btrfs_space_info {
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
-
- /* delalloc accounting */
- u64 bytes_delalloc; /* number of bytes reserved for allocation,
- this space is not necessarily reserved yet
- by the allocator */
+ u64 bytes_super; /* total bytes reserved for the super blocks */
+ u64 bytes_root; /* the number of bytes needed to commit a
+ transaction */
u64 bytes_may_use; /* number of bytes that may be used for
- delalloc */
+ delalloc/allocations */
+ u64 bytes_delalloc; /* number of bytes currently reserved for
+ delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
+ int force_delalloc; /* make people start doing filemap_flush until
+ we're under a threshold */
struct list_head list;
+ /* for controlling how we free up space for allocations */
+ wait_queue_head_t allocate_wait;
+ wait_queue_head_t flush_wait;
+ int allocating_chunk;
+ int flushing;
+
/* for block groups in our same type */
struct list_head block_groups;
spinlock_t lock;
@@ -726,6 +738,15 @@ enum btrfs_caching_type {
BTRFS_CACHE_FINISHED = 2,
};
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_block_group_cache *block_group;
+ u64 progress;
+ atomic_t count;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -733,6 +754,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 bytes_super;
u64 flags;
u64 sectorsize;
int extents_thresh;
@@ -742,8 +764,9 @@ struct btrfs_block_group_cache {
int dirty;
/* cache tracking stuff */
- wait_queue_head_t caching_q;
int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -782,13 +805,16 @@ struct btrfs_fs_info {
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
- struct extent_io_tree pinned_extents;
+ struct extent_io_tree freed_extents[2];
+ struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +848,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex drop_mutex;
struct mutex volume_mutex;
- struct mutex tree_reloc_mutex;
- struct rw_semaphore extent_commit_sem;
-
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -835,10 +857,16 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
+ struct rw_semaphore extent_commit_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ struct srcu_struct subvol_srcu;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
+ struct list_head caching_block_groups;
atomic_t nr_async_submits;
atomic_t async_submit_draining;
@@ -882,6 +910,7 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
+ struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
struct btrfs_workers endio_workers;
@@ -889,6 +918,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
+ struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -979,7 +1009,10 @@ struct btrfs_root {
atomic_t log_writers;
atomic_t log_commit[2];
unsigned long log_transid;
+ unsigned long last_log_commit;
unsigned long log_batch;
+ pid_t log_start_pid;
+ bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -996,10 +1029,12 @@ struct btrfs_root {
u32 stripesize;
u32 type;
- u64 highest_inode;
- u64 last_inode_alloc;
+
+ u64 highest_objectid;
int ref_cows;
int track_dirty;
+ int in_radix;
+
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1118,6 +1153,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
+#define BTRFS_MOUNT_DISCARD (1 << 10)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin);
+ struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent);
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id);
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len);
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key);
@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
int btrfs_search_root(struct btrfs_root *root, u64 search_start,
u64 *found_objectid);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
int btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
/* dir-item.c */
@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
u64 objectid, const char *name, int name_len,
int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len);
@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* inode-map.c */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2286,11 +2344,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_block);
+ u64 inline_limit, u64 *hint_block, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
/* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
int btrfs_check_acl(struct inode *inode, int mask);
#else
#define btrfs_check_acl NULL
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c..f3a6075519cc 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return di;
+
+ path->slots[0]++;
+ }
+ return NULL;
+}
+
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..02b6afbd7450 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
struct extent_map *em;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
em->bdev =
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
goto out;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret)
em = ERR_PTR(ret);
@@ -772,7 +773,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
-static struct address_space_operations btree_aops = {
+static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
.writepage = btree_writepage,
.writepages = btree_writepages,
@@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
- buf->start + buf->len - 1, WB_SYNC_ALL);
+ return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
- buf->start, buf->start + buf->len - 1);
+ return filemap_fdatawait_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
- root->highest_inode = 0;
- root->last_inode_alloc = 0;
+ root->highest_objectid = 0;
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree.rb_node = NULL;
@@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
root->log_batch = 0;
root->log_transid = 0;
+ root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -952,14 +953,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
root, fs_info, objectid);
ret = btrfs_find_last_root(tree_root, objectid,
&root->root_item, &root->root_key);
+ if (ret > 0)
+ return -ENOENT;
BUG_ON(ret);
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
+ root->commit_root = btrfs_root_node(root);
return 0;
}
@@ -1085,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->last_log_commit = 0;
return 0;
}
@@ -1095,7 +1099,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
- u64 highest_inode;
u64 generation;
u32 blocksize;
int ret = 0;
@@ -1110,7 +1113,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
kfree(root);
return ERR_PTR(ret);
}
- goto insert;
+ goto out;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1123,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
path = btrfs_alloc_path();
BUG_ON(!path);
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret != 0) {
- if (ret > 0)
- ret = -ENOENT;
- goto out;
+ if (ret == 0) {
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
}
- l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
- memcpy(&root->root_key, location, sizeof(*location));
- ret = 0;
-out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
if (ret) {
- kfree(root);
+ if (ret > 0)
+ ret = -ENOENT;
return ERR_PTR(ret);
}
+
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
-insert:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
root->ref_cows = 1;
- ret = btrfs_find_highest_inode(root, &highest_inode);
- if (ret == 0) {
- root->highest_inode = highest_inode;
- root->last_inode_alloc = highest_inode;
- }
- }
+
return root;
}
@@ -1187,39 +1181,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
-
+again:
+ spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)location->objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (root)
return root;
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret == 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ERR_PTR(ret);
+
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
+ WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto fail;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
+ if (ret == -EEXIST) {
+ free_fs_root(root);
+ goto again;
+ }
+ goto fail;
}
- if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- BUG_ON(ret);
+
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid);
+ WARN_ON(ret);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY))
btrfs_orphan_cleanup(root);
- }
+
return root;
+fail:
+ free_fs_root(root);
+ return ERR_PTR(ret);
}
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
const char *name, int namelen)
{
+ return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
struct btrfs_root *root;
int ret;
@@ -1236,7 +1257,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#if 0
+
ret = btrfs_sysfs_add_root(root);
if (ret) {
free_extent_buffer(root->node);
@@ -1244,9 +1265,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#endif
root->in_sysfs = 1;
return root;
+#endif
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1346,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
offset = page_offset(page);
em_tree = &BTRFS_I(inode)->extent_tree;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em) {
__unplug_io_fn(bdi, page);
return;
@@ -1352,6 +1373,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
+ bdi->name = "btrfs";
bdi->capabilities = BDI_CAP_MAP_COPY;
err = bdi_init(bdi);
if (err)
@@ -1359,8 +1381,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
err = bdi_register(bdi, NULL, "btrfs-%d",
atomic_inc_return(&btrfs_bdi_num));
- if (err)
+ if (err) {
+ bdi_destroy(bdi);
return err;
+ }
bdi->ra_pages = default_backing_dev_info.ra_pages;
bdi->unplug_io_fn = btrfs_unplug_io_fn;
@@ -1450,9 +1474,12 @@ static int cleaner_kthread(void *arg)
break;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+ mutex_trylock(&root->fs_info->cleaner_mutex)) {
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ }
if (freezing(current)) {
refrigerator();
@@ -1557,15 +1584,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -ENOMEM;
goto fail;
}
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+ ret = init_srcu_struct(&fs_info->subvol_srcu);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ ret = setup_bdi(fs_info, &fs_info->bdi);
+ if (ret) {
+ err = ret;
+ goto fail_srcu;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bdi;
+ }
+
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1584,12 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
- if (setup_bdi(fs_info, &fs_info->bdi))
- goto fail_bdi;
- fs_info->btree_inode = new_inode(sb);
- fs_info->btree_inode->i_ino = 1;
- fs_info->btree_inode->i_nlink = 1;
- fs_info->metadata_ratio = 8;
+ fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1599,7 +1642,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ fs_info->btree_inode->i_nlink = 1;
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ insert_inode_hash(fs_info->btree_inode);
+
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree.rb_node = NULL;
- extent_io_tree_init(&fs_info->pinned_extents,
+ extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->freed_extents[1],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
- mutex_init(&fs_info->drop_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- mutex_init(&fs_info->tree_reloc_mutex);
init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->subvol_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_iput;
}
- /*
- * we need to start all the end_io workers up front because the
- * queue work function gets called at interrupt time, and so it
- * cannot dynamically grow.
- */
+ btrfs_init_workers(&fs_info->generic_worker,
+ "genwork", 1, NULL);
+
btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size));
+ fs_info->thread_pool_size),
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->delalloc_workers.idle_thresh = 2;
fs_info->delalloc_workers.ordered = 1;
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size);
+ "endio-meta-write", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_write_workers.idle_thresh = 64;
- fs_info->endio_meta_write_workers.idle_thresh = 64;
+ fs_info->endio_write_workers.idle_thresh = 2;
+ fs_info->endio_meta_write_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
+ btrfs_start_workers(&fs_info->generic_worker, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_write_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_write_workers,
- fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+ btrfs_start_workers(&fs_info->endio_write_workers, 1);
+ btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
}
+ ret = btrfs_find_orphan_roots(tree_root);
+ BUG_ON(ret);
+
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_recover_relocation(tree_root);
BUG_ON(ret);
@@ -1959,6 +2020,7 @@ fail_chunk_root:
free_extent_buffer(chunk_root->node);
free_extent_buffer(chunk_root->commit_root);
fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -1967,6 +2029,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -1975,6 +2038,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
+fail_srcu:
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
kfree(extent_root);
kfree(tree_root);
@@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ synchronize_srcu(&fs_info->subvol_srcu);
+
+ free_fs_root(root);
+ return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
kill_anon_super(&root->anon_super);
}
- if (root->node)
- free_extent_buffer(root->node);
- if (root->commit_root)
- free_extent_buffer(root->commit_root);
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
kfree(root->name);
kfree(root);
- return 0;
}
static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *gang[8];
int i;
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (gang[0]->in_radix) {
+ btrfs_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ kfree(gang[0]);
+ }
+ }
+
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
root_objectid = gang[i]->root_key.objectid;
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root_objectid);
- BUG_ON(ret);
btrfs_orphan_cleanup(gang[i]);
}
root_objectid++;
@@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(root->fs_info->csum_root->commit_root);
btrfs_free_block_groups(root->fs_info);
- btrfs_free_pinned_extents(root->fs_info);
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4e..ba5c3fd5ab8c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
- fid->objectid = BTRFS_I(inode)->location.objectid;
+ fid->objectid = inode->i_ino;
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
}
static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation)
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
struct btrfs_root *root;
+ struct dentry *dentry;
struct inode *inode;
struct btrfs_key key;
+ int index;
+ int err = 0;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto fail;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ err = -ENOENT;
+ goto fail;
+ }
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(inode))
- return (void *)inode;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
- if (generation != inode->i_generation) {
+ if (check_generation && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
+ static struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
- int slot;
- u64 objectid;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
int ret;
path = btrfs_alloc_path();
- key.objectid = dir->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
- key.offset = (u64)-1;
+ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = dir->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- /* Error */
- btrfs_free_path(path);
- return ERR_PTR(ret);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
}
+
+ path->slots[0]--;
leaf = path->nodes[0];
- slot = path->slots[0];
- if (ret) {
- /* btrfs_search_slot() returns the slot where we'd want to
- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
- The _real_ backref, telling us what the parent inode
- _actually_ is, will be in the slot _before_ the one
- that btrfs_search_slot() returns. */
- if (!slot) {
- /* Unless there is _no_ key in the tree before... */
- btrfs_free_path(path);
- return ERR_PTR(-EIO);
- }
- slot--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
btrfs_free_path(path);
- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
- return ERR_PTR(-EINVAL);
-
- objectid = key.offset;
-
- /* If we are already at the root of a subvol, return the real root */
- if (objectid == dir->i_ino)
- return dget(dir->i_sb->s_root);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
- /* Build a new key for the inode item */
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-
- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
}
const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..94627c4cc193 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
#include "locking.h"
#include "free-space-cache.h"
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
-
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
+ struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
return ret;
}
-/*
- * We always set EXTENT_LOCKED for the super mirror extents so we don't
- * overwrite them, so those bits need to be unset. Also, if we are unmounting
- * with pinned extents still sitting there because we had a block group caching,
- * we need to clear those now, since we are done.
- */
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+static int add_excluded_extent(struct btrfs_root *root,
+ u64 start, u64 num_bytes)
{
- u64 start, end, last = 0;
- int ret;
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ set_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ return 0;
+}
- while (1) {
- ret = find_first_extent_bit(&info->pinned_extents, last,
- &start, &end,
- EXTENT_LOCKED|EXTENT_DIRTY);
- if (ret)
- break;
+static void free_excluded_extents(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 start, end;
- clear_extent_bits(&info->pinned_extents, start, end,
- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
- last = end+1;
- }
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+
+ clear_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ clear_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
}
-static int remove_sb_from_cache(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache)
+static int exclude_super_stripes(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
BUG_ON(ret);
+
while (nr--) {
- try_lock_extent(&fs_info->pinned_extents,
- logical[nr],
- logical[nr] + stripe_len - 1, GFP_NOFS);
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, logical[nr],
+ stripe_len);
+ BUG_ON(ret);
}
+
kfree(logical);
}
-
return 0;
}
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_STARTED) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (atomic_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
int ret;
while (start < end) {
- ret = find_first_extent_bit(&info->pinned_extents, start,
+ ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY|EXTENT_LOCKED);
+ EXTENT_DIRTY | EXTENT_UPTODATE);
if (ret)
break;
@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
{
struct btrfs_block_group_cache *block_group = data;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 last = 0;
+ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
- int ret = 0;
- struct btrfs_key key;
struct extent_buffer *leaf;
- int slot;
+ struct btrfs_key key;
u64 total_found = 0;
-
- BUG_ON(!fs_info);
+ u64 last = 0;
+ u32 nritems;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- atomic_inc(&block_group->space_info->caching_threads);
+ exclude_super_stripes(extent_root, block_group);
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->bytes_super += block_group->bytes_super;
+ spin_unlock(&block_group->space_info->lock);
+
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
key.objectid = last;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ key.type = BTRFS_EXTENT_ITEM_KEY;
again:
+ mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
down_read(&fs_info->extent_commit_sem);
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
while (1) {
smp_mb();
- if (block_group->fs_info->closing > 1) {
+ if (fs_info->closing > 1) {
last = (u64)-1;
break;
}
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(fs_info->extent_root, path);
- if (ret < 0)
- goto err;
- else if (ret)
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = find_next_key(path, 0, &key);
+ if (ret)
break;
- if (need_resched() ||
- btrfs_transaction_in_commit(fs_info)) {
- leaf = path->nodes[0];
-
- /* this shouldn't happen, but if the
- * leaf is empty just move on.
- */
- if (btrfs_header_nritems(leaf) == 0)
- break;
- /*
- * we need to copy the key out so that
- * we are sure the next search advances
- * us forward in the btree.
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(fs_info->extent_root, path);
- up_read(&fs_info->extent_commit_sem);
+ caching_ctl->progress = last;
+ btrfs_release_path(extent_root, path);
+ up_read(&fs_info->extent_commit_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ if (btrfs_transaction_in_commit(fs_info))
schedule_timeout(1);
- goto again;
- }
+ else
+ cond_resched();
+ goto again;
+ }
+ if (key.objectid < block_group->key.objectid) {
+ path->slots[0]++;
continue;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid < block_group->key.objectid)
- goto next;
if (key.objectid >= block_group->key.objectid +
block_group->key.offset)
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
total_found += add_new_free_space(block_group,
fs_info, last,
key.objectid);
last = key.objectid + key.offset;
- }
- if (total_found > (1024 * 1024 * 2)) {
- total_found = 0;
- wake_up(&block_group->caching_q);
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
}
-next:
path->slots[0]++;
}
ret = 0;
@@ -352,33 +381,65 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
err:
btrfs_free_path(path);
up_read(&fs_info->extent_commit_sem);
- atomic_dec(&block_group->space_info->caching_threads);
- wake_up(&block_group->caching_q);
+ free_excluded_extents(extent_root, block_group);
+
+ mutex_unlock(&caching_ctl->mutex);
+ wake_up(&caching_ctl->wait);
+
+ put_caching_control(caching_ctl);
+ atomic_dec(&block_group->space_info->caching_threads);
return 0;
}
static int cache_block_group(struct btrfs_block_group_cache *cache)
{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
struct task_struct *tsk;
int ret = 0;
+ smp_mb();
+ if (cache->cached != BTRFS_CACHE_NO)
+ return 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ /* one for caching kthread, one for caching block group list */
+ atomic_set(&caching_ctl->count, 2);
+
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
- return ret;
+ kfree(caching_ctl);
+ return 0;
}
+ cache->caching_ctl = caching_ctl;
cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
+ down_write(&fs_info->extent_commit_sem);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->extent_commit_sem);
+
+ atomic_inc(&cache->space_info->caching_threads);
+
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
cache->key.objectid);
if (IS_ERR(tsk)) {
@@ -1507,22 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-#ifdef BIO_RW_DISCARD
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
}
-#endif
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
u64 num_bytes)
{
-#ifdef BIO_RW_DISCARD
int ret;
u64 map_length = num_bytes;
struct btrfs_multi_bio *multi = NULL;
+ if (!btrfs_test_opt(root, DISCARD))
+ return 0;
+
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
bytenr, &map_length, &multi, 0);
@@ -1542,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
return ret;
-#else
- return 0;
-#endif
}
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -1656,7 +1715,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
parent, ref_root, flags,
ref->objectid, ref->offset,
&ins, node->ref_mod);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent,
@@ -1782,7 +1840,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
extent_op->flags_to_set,
&extent_op->key,
ref->level, &ins);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
@@ -1817,16 +1874,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
+ int mark_free = 0;
+ struct extent_buffer *must_clean = NULL;
+
+ ret = pin_down_bytes(trans, root, NULL,
+ node->bytenr, node->num_bytes,
+ head->is_data, 1, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- btrfs_update_pinned_extents(root, node->bytenr,
- node->num_bytes, 1);
- update_reserved_extents(root, node->bytenr,
- node->num_bytes, 0);
+ if (mark_free) {
+ ret = btrfs_free_reserved_extent(root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2691,60 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
alloc_target);
}
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+ u64 num_bytes;
+ int level;
+
+ level = BTRFS_MAX_LEVEL - 2;
+ /*
+ * NOTE: these calculations are absolutely the worst possible case.
+ * This assumes that _every_ item we insert will require a new leaf, and
+ * that the tree has grown to its maximum level size.
+ */
+
+ /*
+ * for every item we insert we could insert both an extent item and a
+ * extent ref item. Then for ever item we insert, we will need to cow
+ * both the original leaf, plus the leaf to the left and right of it.
+ *
+ * Unless we are talking about the extent root, then we just want the
+ * number of items * 2, since we just need the extent item plus its ref.
+ */
+ if (root == root->fs_info->extent_root)
+ num_bytes = num_items * 2;
+ else
+ num_bytes = (num_items + (2 * num_items)) * 3;
+
+ /*
+ * num_bytes is total number of leaves we could need times the leaf
+ * size, and then for every leaf we could end up cow'ing 2 nodes per
+ * level, down to the leaf level.
+ */
+ num_bytes = (num_bytes * root->leafsize) +
+ (num_bytes * (level * 2)) * root->nodesize;
+
+ return num_bytes;
+}
+
/*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc. If we have less reserved credits than
+ * we have extents, this function does nothing.
*/
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
- u64 alloc_target, thresh;
- int committed = 0, ret;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
-again:
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+
spin_lock(&meta_sinfo->lock);
- if (!meta_sinfo->full)
- thresh = meta_sinfo->total_bytes * 80;
- else
- thresh = meta_sinfo->total_bytes * 95;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ if (BTRFS_I(inode)->reserved_extents <=
+ BTRFS_I(inode)->outstanding_extents) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ spin_unlock(&meta_sinfo->lock);
+ return 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ BTRFS_I(inode)->reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+
+ if (meta_sinfo->bytes_delalloc < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_delalloc = 0;
+ } else {
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+ u64 thresh;
+
+ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use;
+
+ thresh = meta_sinfo->total_bytes - thresh;
+ thresh *= 80;
do_div(thresh, 100);
+ if (thresh <= meta_sinfo->bytes_delalloc)
+ meta_sinfo->force_delalloc = 1;
+ else
+ meta_sinfo->force_delalloc = 0;
+}
- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
- struct btrfs_trans_handle *trans;
- if (!meta_sinfo->full) {
- meta_sinfo->force_alloc = 1;
- spin_unlock(&meta_sinfo->lock);
+struct async_flush {
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+ struct btrfs_work work;
+};
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+ struct async_flush *async;
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, alloc_target, 0);
- btrfs_end_transaction(trans, root);
+ async = container_of(work, struct async_flush, work);
+ root = async->root;
+ info = async->info;
+
+ btrfs_start_delalloc_inodes(root);
+ wake_up(&info->flush_wait);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+
+ kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+ DEFINE_WAIT(wait);
+ u64 used;
+
+ while (1) {
+ prepare_to_wait(&info->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&info->lock);
+ if (!info->flushing) {
+ spin_unlock(&info->lock);
+ break;
+ }
+
+ used = info->bytes_used + info->bytes_reserved +
+ info->bytes_pinned + info->bytes_readonly +
+ info->bytes_super + info->bytes_root +
+ info->bytes_may_use + info->bytes_delalloc;
+ if (used < info->total_bytes) {
+ spin_unlock(&info->lock);
+ break;
+ }
+ spin_unlock(&info->lock);
+ schedule();
+ }
+ finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct async_flush *async;
+ bool wait = false;
+
+ spin_lock(&info->lock);
+
+ if (!info->flushing) {
+ info->flushing = 1;
+ init_waitqueue_head(&info->flush_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_on_flush(info);
+ return;
+ }
+
+ async = kzalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ goto flush;
+
+ async->root = root;
+ async->info = info;
+ async->work.func = flush_delalloc_async;
+
+ btrfs_queue_worker(&root->fs_info->enospc_workers,
+ &async->work);
+ wait_on_flush(info);
+ return;
+
+flush:
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+}
+
+static int maybe_allocate_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ bool wait = false;
+ int ret = 0;
+ u64 min_metadata;
+ u64 free_space;
+
+ free_space = btrfs_super_total_bytes(disk_super);
+ /*
+ * we allow the metadata to grow to a max of either 10gb or 5% of the
+ * space in the volume.
+ */
+ min_metadata = min((u64)10 * 1024 * 1024 * 1024,
+ div64_u64(free_space * 5, 100));
+ if (info->total_bytes >= min_metadata) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (info->full) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (!info->allocating_chunk) {
+ info->force_alloc = 1;
+ info->allocating_chunk = 1;
+ init_waitqueue_head(&info->allocate_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->allocate_wait,
+ !info->allocating_chunk);
+ return 1;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 4096 + 2 * 1024 * 1024,
+ info->flags, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+out:
+ spin_lock(&info->lock);
+ info->allocating_chunk = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->allocate_wait);
+
+ if (ret)
+ return 0;
+ return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int flushed = 0;
+ int force_delalloc;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ force_delalloc = meta_sinfo->force_delalloc;
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!flushed)
+ meta_sinfo->bytes_delalloc += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ flushed++;
+
+ if (flushed == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ flushed++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (flushed == 2) {
+ filemap_flush(inode->i_mapping);
+ goto again;
+ } else if (flushed == 3) {
+ flush_delalloc(root, meta_sinfo);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
+ printk(KERN_ERR "enospc, has %d, reserved %d\n",
+ BTRFS_I(inode)->outstanding_extents,
+ BTRFS_I(inode)->reserved_extents);
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
- if (!committed) {
- committed = 1;
- trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
+ BTRFS_I(inode)->reserved_extents++;
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ if (!flushed && force_delalloc)
+ filemap_flush(inode->i_mapping);
+
+ return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space. This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ if (meta_sinfo->bytes_may_use < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_may_use = 0;
+ } else {
+ meta_sinfo->bytes_may_use -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+/*
+ * Reserve some metadata space for use. We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items. If we
+ * have space, fantastic, if not, you get -ENOSPC. Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space. THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int retries = 0;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!retries)
+ meta_sinfo->bytes_may_use += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ retries++;
+ if (retries == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ retries++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (retries == 2) {
+ flush_delalloc(root, meta_sinfo);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_may_use -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+
+ dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
+
+ check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
@@ -2764,13 +3225,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
data_sinfo = BTRFS_I(inode)->space_info;
+ if (!data_sinfo)
+ goto alloc;
+
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
if (data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
- data_sinfo->bytes_may_use < bytes) {
+ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
struct btrfs_trans_handle *trans;
/*
@@ -2782,7 +3246,7 @@ again:
data_sinfo->force_alloc = 1;
spin_unlock(&data_sinfo->lock);
-
+alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
trans = btrfs_start_transaction(root, 1);
if (!trans)
@@ -2794,12 +3258,17 @@ again:
btrfs_end_transaction(trans, root);
if (ret)
return ret;
+
+ if (!data_sinfo) {
+ btrfs_set_inode_space_info(root, inode);
+ data_sinfo = BTRFS_I(inode)->space_info;
+ }
goto again;
}
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
- if (!committed) {
+ if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
if (!trans)
@@ -2827,7 +3296,7 @@ again:
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
- return btrfs_check_metadata_free_space(root);
+ return 0;
}
/*
@@ -2926,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(!space_info);
spin_lock(&space_info->lock);
- if (space_info->force_alloc) {
+ if (space_info->force_alloc)
force = 1;
- space_info->force_alloc = 0;
- }
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 6);
+ thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2950,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
- if (flags & BTRFS_BLOCK_GROUP_DATA) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
@@ -2958,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ space_info->force_alloc = 0;
+ spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
@@ -3008,10 +3478,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
old_val += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
if (cache->ro)
cache->space_info->bytes_readonly -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
@@ -3056,127 +3528,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin)
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache;
- if (pin)
- set_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + num - 1, GFP_NOFS);
-
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
- if (pin) {
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += len;
- cache->space_info->bytes_pinned += len;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fs_info->total_pinned += len;
- } else {
- int unpin = 0;
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
- /*
- * in order to not race with the block group caching, we
- * only want to unpin the extent if we are cached. If
- * we aren't cached, we want to start async caching this
- * block group so we can free the extent the next time
- * around.
- */
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- unpin = (cache->cached == BTRFS_CACHE_FINISHED);
- if (likely(unpin)) {
- cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- fs_info->total_pinned -= len;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
- if (likely(unpin))
- clear_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + len -1,
- GFP_NOFS);
- else
- cache_block_group(cache);
+ btrfs_put_block_group(cache);
- if (unpin)
- btrfs_add_free_space(cache, bytenr, len);
- }
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
+ set_extent_dirty(fs_info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ return 0;
+}
+
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ cache->reserved += num_bytes;
+ cache->space_info->bytes_reserved += num_bytes;
+ } else {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
return 0;
}
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve)
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_caching_control *next;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_block_group_cache *cache;
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
+ down_write(&fs_info->extent_commit_sem);
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += len;
- cache->space_info->bytes_reserved += len;
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ cache = caching_ctl->block_group;
+ if (block_group_cache_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ put_caching_control(caching_ctl);
} else {
- cache->reserved -= len;
- cache->space_info->bytes_reserved -= len;
+ cache->last_byte_to_unpin = caching_ctl->progress;
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
}
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
+ else
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+ up_write(&fs_info->extent_commit_sem);
return 0;
}
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- u64 last = 0;
- u64 start;
- u64 end;
- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
- int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 len;
- while (1) {
- ret = find_first_extent_bit(pinned_extents, last,
- &start, &end, EXTENT_DIRTY);
- if (ret)
- break;
+ while (start <= end) {
+ if (!cache ||
+ start >= cache->key.objectid + cache->key.offset) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache);
+ }
+
+ len = cache->key.objectid + cache->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ if (start < cache->last_byte_to_unpin) {
+ len = min(len, cache->last_byte_to_unpin - start);
+ btrfs_add_free_space(cache, start, len);
+ }
- set_extent_dirty(copy, start, end, GFP_NOFS);
- last = end + 1;
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ start += len;
}
+
+ if (cache)
+ btrfs_put_block_group(cache);
return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin)
+ struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
+ else
+ unpin = &fs_info->freed_extents[0];
+
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3185,10 +3666,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
- /* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
-
+ unpin_extent_range(root, start, end);
cond_resched();
}
@@ -3198,7 +3677,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- u64 bytenr, u64 num_bytes, int is_data,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
struct extent_buffer **must_clean)
{
int err = 0;
@@ -3207,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
if (is_data)
goto pinit;
+ /*
+ * discard is sloooow, and so triggering discards on
+ * individual btree blocks isn't a good plan. Just
+ * pin everything in discard mode.
+ */
+ if (btrfs_test_opt(root, DISCARD))
+ goto pinit;
+
buf = btrfs_find_tree_block(root, bytenr, num_bytes);
if (!buf)
goto pinit;
@@ -3230,15 +3718,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
}
free_extent_buffer(buf);
pinit:
- btrfs_set_path_blocking(path);
+ if (path)
+ btrfs_set_path_blocking(path);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
BUG_ON(err < 0);
return 0;
}
-
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -3412,7 +3900,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, &must_clean);
+ num_bytes, is_data, 0, &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
@@ -3543,8 +4031,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- update_reserved_extents(root, bytenr, num_bytes, 0);
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3584,24 +4071,38 @@ static noinline int
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
+ struct btrfs_caching_control *caching_ctl;
DEFINE_WAIT(wait);
- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
-
- if (block_group_cache_done(cache)) {
- finish_wait(&cache->caching_q, &wait);
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
return 0;
- }
- schedule();
- finish_wait(&cache->caching_q, &wait);
- wait_event(cache->caching_q, block_group_cache_done(cache) ||
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
(cache->free_space >= num_bytes));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+ put_caching_control(caching_ctl);
return 0;
}
enum btrfs_loop_type {
- LOOP_CACHED_ONLY = 0,
+ LOOP_FIND_IDEAL = 0,
LOOP_CACHING_NOWAIT = 1,
LOOP_CACHING_WAIT = 2,
LOOP_ALLOC_CHUNK = 3,
@@ -3630,10 +4131,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group = NULL;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
+ int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
bool found_uncached_bg = false;
+ bool failed_cluster_refill = false;
+ bool failed_alloc = false;
+ u64 ideal_cache_percent = 0;
+ u64 ideal_cache_offset = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3669,14 +4175,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
+ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
+ *
+ * However if we are re-searching with an ideal block group
+ * picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, data) &&
- block_group_cache_done(block_group)) {
+ (block_group->cached != BTRFS_CACHE_NO ||
+ search_start == ideal_cache_offset)) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -3688,13 +4199,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
*/
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
- } else
+ } else {
goto have_block_group;
+ }
} else if (block_group) {
btrfs_put_block_group(block_group);
}
}
-
search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -3706,32 +4217,58 @@ search:
have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ u64 free_percent;
+
+ free_percent = btrfs_block_group_used(&block_group->item);
+ free_percent *= 100;
+ free_percent = div64_u64(free_percent,
+ block_group->key.offset);
+ free_percent = 100 - free_percent;
+ if (free_percent > ideal_cache_percent &&
+ likely(!block_group->ro)) {
+ ideal_cache_offset = block_group->key.objectid;
+ ideal_cache_percent = free_percent;
+ }
+
/*
- * we want to start caching kthreads, but not too many
- * right off the bat so we don't overwhelm the system,
- * so only start them if there are less than 2 and we're
- * in the initial allocation phase.
+ * We only want to start kthread caching if we are at
+ * the point where we will wait for caching to make
+ * progress, or if our ideal search is over and we've
+ * found somebody to start caching.
*/
if (loop > LOOP_CACHING_NOWAIT ||
- atomic_read(&space_info->caching_threads) < 2) {
+ (loop > LOOP_FIND_IDEAL &&
+ atomic_read(&space_info->caching_threads) < 2)) {
ret = cache_block_group(block_group);
BUG_ON(ret);
}
- }
-
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached)) {
found_uncached_bg = true;
- /* if we only want cached bgs, loop */
- if (loop == LOOP_CACHED_ONLY)
+ /*
+ * If loop is set for cached only, try the next block
+ * group.
+ */
+ if (loop == LOOP_FIND_IDEAL)
goto loop;
}
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached))
+ found_uncached_bg = true;
+
if (unlikely(block_group->ro))
goto loop;
- if (last_ptr) {
+ /*
+ * Ok we want to try and use the cluster allocator, so lets look
+ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
+ * have tried the cluster allocator plenty of times at this
+ * point and not have found anything, so we are likely way too
+ * fragmented for the clustering stuff to find anything, so lets
+ * just skip it and let the allocator find whatever block it can
+ * find
+ */
+ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -3806,9 +4343,11 @@ refill_cluster:
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
- } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
+ && !failed_cluster_refill) {
spin_unlock(&last_ptr->refill_lock);
+ failed_cluster_refill = true;
wait_block_group_cache_progress(block_group,
num_bytes + empty_cluster + empty_size);
goto have_block_group;
@@ -3820,25 +4359,30 @@ refill_cluster:
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
- if (loop < LOOP_NO_EMPTY_SIZE) {
- btrfs_return_cluster_to_free_space(NULL,
- last_ptr);
- spin_unlock(&last_ptr->refill_lock);
- goto loop;
- }
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
spin_unlock(&last_ptr->refill_lock);
+ goto loop;
}
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
- if (!offset && (cached || (!cached &&
- loop == LOOP_CACHING_NOWAIT))) {
- goto loop;
- } else if (!offset && (!cached &&
- loop > LOOP_CACHING_NOWAIT)) {
+ /*
+ * If we didn't find a chunk, and we haven't failed on this
+ * block group before, and this block group is in the middle of
+ * caching and we are ok with waiting, then go ahead and wait
+ * for progress to be made, and set failed_alloc to true.
+ *
+ * If failed_alloc is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !failed_alloc && !cached &&
+ loop > LOOP_CACHING_NOWAIT) {
wait_block_group_cache_progress(block_group,
- num_bytes + empty_size);
+ num_bytes + empty_size);
+ failed_alloc = true;
goto have_block_group;
+ } else if (!offset) {
+ goto loop;
}
checks:
search_start = stripe_align(root, offset);
@@ -3880,16 +4424,22 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
+ update_reserved_extents(block_group, num_bytes, 1);
+
/* we are all good, lets return */
break;
loop:
+ failed_cluster_refill = false;
+ failed_alloc = false;
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
- /* LOOP_CACHED_ONLY, only search fully cached block groups
- * LOOP_CACHING_NOWAIT, search partially cached block groups, but
- * dont wait foR them to finish caching
+ /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
+ * for them to make caching progress. Also
+ * determine the best possible bg to cache
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -3898,12 +4448,47 @@ loop:
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
(found_uncached_bg || empty_size || empty_cluster ||
allowed_chunk_alloc)) {
- if (found_uncached_bg) {
+ if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
- if (loop < LOOP_CACHING_WAIT) {
- loop++;
+ loop++;
+ if (!ideal_cache_percent &&
+ atomic_read(&space_info->caching_threads))
goto search;
- }
+
+ /*
+ * 1 of the following 2 things have happened so far
+ *
+ * 1) We found an ideal block group for caching that
+ * is mostly full and will cache quickly, so we might
+ * as well wait for it.
+ *
+ * 2) We searched for cached only and we didn't find
+ * anything, and we didn't start any caching kthreads
+ * either, so chances are we will loop through and
+ * start a couple caching kthreads, and then come back
+ * around and just wait for them. This will be slower
+ * because we will have 2 caching kthreads reading at
+ * the same time when we could have just started one
+ * and waited for it to get far enough to give us an
+ * allocation, so go ahead and go to the wait caching
+ * loop.
+ */
+ loop = LOOP_CACHING_WAIT;
+ search_start = ideal_cache_offset;
+ ideal_cache_percent = 0;
+ goto ideal_cache;
+ } else if (loop == LOOP_FIND_IDEAL) {
+ /*
+ * Didn't find a uncached bg, wait on anything we find
+ * next.
+ */
+ loop = LOOP_CACHING_WAIT;
+ goto search;
+ }
+
+ if (loop < LOOP_CACHING_WAIT) {
+ loop++;
+ goto search;
}
if (loop == LOOP_ALLOC_CHUNK) {
@@ -3915,7 +4500,8 @@ loop:
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data, 1);
allowed_chunk_alloc = 0;
- } else {
+ done_chunk_alloc = 1;
+ } else if (!done_chunk_alloc) {
space_info->force_alloc = 1;
}
@@ -3940,21 +4526,32 @@ loop:
return ret;
}
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
- info->bytes_pinned - info->bytes_reserved),
+ info->bytes_pinned - info->bytes_reserved -
+ info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu\n",
+ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+ "\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used);
+ (unsigned long long)info->bytes_used,
+ (unsigned long long)info->bytes_root,
+ (unsigned long long)info->bytes_super,
+ (unsigned long long)info->bytes_reserved);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
@@ -3972,12 +4569,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
up_read(&info->groups_sem);
}
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
{
int ret;
u64 search_start = 0;
@@ -4022,7 +4619,7 @@ again:
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes);
+ dump_space_info(sinfo, num_bytes, 1);
}
return ret;
@@ -4043,25 +4640,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
+ update_reserved_extents(cache, len, 0);
btrfs_put_block_group(cache);
- update_reserved_extents(root, start, len, 0);
-
- return ret;
-}
-
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
-{
- int ret;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
- empty_size, hint_byte, search_end, ins,
- data);
- if (!ret)
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
@@ -4222,15 +4802,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+ u64 start = ins->objectid;
+ u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
cache_block_group(block_group);
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ caching_ctl = get_caching_control(block_group);
- ret = btrfs_remove_free_space(block_group, ins->objectid,
- ins->offset);
- BUG_ON(ret);
+ if (!caching_ctl) {
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+
+ start = caching_ctl->progress;
+ num_bytes = ins->objectid + ins->offset -
+ caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ }
+
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+
+ update_reserved_extents(block_group, ins->offset, 1);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -4254,9 +4865,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u64 flags = 0;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+ empty_size, hint_byte, search_end,
+ ins, 0);
if (ret)
return ret;
@@ -4267,7 +4878,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
} else
BUG_ON(parent > 0);
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4346,452 +4956,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return buf;
}
-#if 0
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *leaf)
-{
- u64 disk_bytenr;
- u64 num_bytes;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- u32 nritems;
- int i;
- int ret;
-
- BUG_ON(!btrfs_is_leaf(leaf));
- nritems = btrfs_header_nritems(leaf);
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &key, i);
-
- /* only extents have references, skip everything else */
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
-
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-
- /* inline extents live in the btree, they don't have refs */
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
-
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-
- /* holes don't have refs */
- if (disk_bytenr == 0)
- continue;
-
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
- leaf->start, 0, key.objectid, 0);
- BUG_ON(ret);
- }
- return 0;
-}
-
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_leaf_ref *ref)
-{
- int i;
- int ret;
- struct btrfs_extent_info *info;
- struct refsort *sorted;
-
- if (ref->nritems == 0)
- return 0;
-
- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
- for (i = 0; i < ref->nritems; i++) {
- sorted[i].bytenr = ref->extents[i].bytenr;
- sorted[i].slot = i;
- }
- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
-
- /*
- * the items in the ref were sorted when the ref was inserted
- * into the ref cache, so this is already in order
- */
- for (i = 0; i < ref->nritems; i++) {
- info = ref->extents + sorted[i].slot;
- ret = btrfs_free_extent(trans, root, info->bytenr,
- info->num_bytes, ref->bytenr,
- ref->owner, ref->generation,
- info->objectid, 0);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
- BUG_ON(ret);
- info++;
- }
-
- kfree(sorted);
- return 0;
-}
-
-
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 start,
- u64 len, u32 *refs)
-{
- int ret;
-
- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
- BUG_ON(ret);
-
-#if 0 /* some debugging code in case we see problems here */
- /* if the refs count is one, it won't get increased again. But
- * if the ref count is > 1, someone may be decreasing it at
- * the same time we are.
- */
- if (*refs != 1) {
- struct extent_buffer *eb = NULL;
- eb = btrfs_find_create_tree_block(root, start, len);
- if (eb)
- btrfs_tree_lock(eb);
-
- mutex_lock(&root->fs_info->alloc_mutex);
- ret = lookup_extent_ref(NULL, root, start, len, refs);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->alloc_mutex);
-
- if (eb) {
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- if (*refs == 1) {
- printk(KERN_ERR "btrfs block %llu went down to one "
- "during drop_snap\n", (unsigned long long)start);
- }
-
- }
-#endif
-
- cond_resched();
- return ret;
-}
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+};
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
-/*
- * this is used while deleting old snapshots, and it drops the refs
- * on a whole subtree starting from a level 1 node.
- *
- * The idea is to sort all the leaf pointers, and then drop the
- * ref on all the leaves in order. Most of the time the leaves
- * will have ref cache entries, so no leaf IOs will be required to
- * find the extents they have references on.
- *
- * For each leaf, any references it has are also dropped in order
- *
- * This ends up dropping the references in something close to optimal
- * order for reading and modifying the extent allocation tree.
- */
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
{
u64 bytenr;
- u64 root_owner;
- u64 root_gen;
- struct extent_buffer *eb = path->nodes[1];
- struct extent_buffer *leaf;
- struct btrfs_leaf_ref *ref;
- struct refsort *sorted = NULL;
- int nritems = btrfs_header_nritems(eb);
+ u64 generation;
+ u64 refs;
+ u64 flags;
+ u64 last = 0;
+ u32 nritems;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
int ret;
- int i;
- int refi = 0;
- int slot = path->slots[1];
- u32 blocksize = btrfs_level_size(root, 0);
- u32 refs;
-
- if (nritems == 0)
- goto out;
-
- root_owner = btrfs_header_owner(eb);
- root_gen = btrfs_header_generation(eb);
- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+ int slot;
+ int nread = 0;
- /*
- * step one, sort all the leaf pointers so we don't scribble
- * randomly into the extent allocation tree
- */
- for (i = slot; i < nritems; i++) {
- sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
- sorted[refi].slot = i;
- refi++;
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
}
- /*
- * nritems won't be zero, but if we're picking up drop_snapshot
- * after a crash, slot might be > 0, so double check things
- * just in case.
- */
- if (refi == 0)
- goto out;
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+ blocksize = btrfs_level_size(root, wc->level - 1);
- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
- /*
- * the first loop frees everything the leaves point to
- */
- for (i = 0; i < refi; i++) {
- u64 ptr_gen;
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
- bytenr = sorted[i].bytenr;
+ if (slot == path->slots[wc->level])
+ goto reada;
- /*
- * check the reference count on this leaf. If it is > 1
- * we just decrement it below and don't update any
- * of the refs the leaf points to.
- */
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
- BUG_ON(ret);
- if (refs != 1)
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
continue;
- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
-
- /*
- * the leaf only had one reference, which means the
- * only thing pointing to this leaf is the snapshot
- * we're deleting. It isn't possible for the reference
- * count to increase again later
- *
- * The reference cache is checked for the leaf,
- * and if found we'll be able to drop any refs held by
- * the leaf without needing to read it in.
- */
- ref = btrfs_lookup_leaf_ref(root, bytenr);
- if (ref && ref->generation != ptr_gen) {
- btrfs_free_leaf_ref(root, ref);
- ref = NULL;
- }
- if (ref) {
- ret = cache_drop_leaf_ref(trans, root, ref);
- BUG_ON(ret);
- btrfs_remove_leaf_ref(root, ref);
- btrfs_free_leaf_ref(root, ref);
- } else {
- /*
- * the leaf wasn't in the reference cache, so
- * we have to read it.
- */
- leaf = read_tree_block(root, bytenr, blocksize,
- ptr_gen);
- ret = btrfs_drop_leaf_ref(trans, root, leaf);
- BUG_ON(ret);
- free_extent_buffer(leaf);
- }
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
- }
-
- /*
- * run through the loop again to free the refs on the leaves.
- * This is faster than doing it in the loop above because
- * the leaves are likely to be clustered together. We end up
- * working in nice chunks on the extent allocation tree.
- */
- for (i = 0; i < refi; i++) {
- bytenr = sorted[i].bytenr;
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, eb->start,
- root_owner, root_gen, 0, 1);
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &refs, &flags);
BUG_ON(ret);
+ BUG_ON(refs == 0);
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
- }
-out:
- kfree(sorted);
-
- /*
- * update the path to show we've processed the entire level 1
- * node. This will get saved into the root's drop_snapshot_progress
- * field so these drops are not repeated again if this transaction
- * commits.
- */
- path->slots[1] = nritems;
- return 0;
-}
-
-/*
- * helper function for drop_snapshot, this walks down the tree dropping ref
- * counts as it goes.
- */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level)
-{
- u64 root_owner;
- u64 root_gen;
- u64 bytenr;
- u64 ptr_gen;
- struct extent_buffer *next;
- struct extent_buffer *cur;
- struct extent_buffer *parent;
- u32 blocksize;
- int ret;
- u32 refs;
-
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
- path->nodes[*level]->len, &refs);
- BUG_ON(ret);
- if (refs > 1)
- goto out;
-
- /*
- * walk down to the last node level and free all the leaves
- */
- while (*level >= 0) {
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- cur = path->nodes[*level];
-
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
-
- if (path->slots[*level] >=
- btrfs_header_nritems(cur))
- break;
+ if (wc->stage == DROP_REFERENCE) {
+ if (refs == 1)
+ goto reada;
- /* the new code goes down to level 1 and does all the
- * leaves pointed to that node in bulk. So, this check
- * for level 0 will always be false.
- *
- * But, the disk format allows the drop_snapshot_progress
- * field in the root to leave things in a state where
- * a leaf will need cleaning up here. If someone crashes
- * with the old code and then boots with the new code,
- * we might find a leaf here.
- */
- if (*level == 0) {
- ret = btrfs_drop_leaf_ref(trans, root, cur);
- BUG_ON(ret);
- break;
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
}
-
- /*
- * once we get to level one, process the whole node
- * at once, including everything below it.
- */
- if (*level == 1) {
- ret = drop_level_one_refs(trans, root, path);
- BUG_ON(ret);
+reada:
+ ret = readahead_tree_block(root, bytenr, blocksize,
+ generation);
+ if (ret)
break;
- }
-
- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
-
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
- BUG_ON(ret);
-
- /*
- * if there is more than one reference, we don't need
- * to read that node to drop any references it has. We
- * just drop the ref we hold on that node and move on to the
- * next slot in this level.
- */
- if (refs != 1) {
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
- path->slots[*level]++;
-
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, parent->start,
- root_owner, root_gen,
- *level - 1, 1);
- BUG_ON(ret);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
- continue;
- }
-
- /*
- * we need to keep freeing things in the next level down.
- * read the block and loop around to process it
- */
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- WARN_ON(*level <= 0);
- if (path->nodes[*level-1])
- free_extent_buffer(path->nodes[*level-1]);
- path->nodes[*level-1] = next;
- *level = btrfs_header_level(next);
- path->slots[*level] = 0;
- cond_resched();
+ last = bytenr + blocksize;
+ nread++;
}
-out:
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
- if (path->nodes[*level] == root->node) {
- parent = path->nodes[*level];
- bytenr = path->nodes[*level]->start;
- } else {
- parent = path->nodes[*level + 1];
- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
- }
-
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- /*
- * cleanup and free the reference on the last node
- * we processed
- */
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
- parent->start, root_owner, root_gen,
- *level, 1);
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
-
- *level += 1;
- BUG_ON(ret);
-
- cond_resched();
- return 0;
+ wc->reada_slot = slot;
}
-#endif
-
-struct walk_control {
- u64 refs[BTRFS_MAX_LEVEL];
- u64 flags[BTRFS_MAX_LEVEL];
- struct btrfs_key update_progress;
- int stage;
- int level;
- int shared_level;
- int update_ref;
- int keep_locks;
-};
-
-#define DROP_REFERENCE 1
-#define UPDATE_BACKREF 2
/*
* hepler to process tree block while walking down the tree.
*
- * when wc->stage == DROP_REFERENCE, this function checks
- * reference count of the block. if the block is shared and
- * we need update back refs for the subtree rooted at the
- * block, this function changes wc->stage to UPDATE_BACKREF
- *
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
*
@@ -4800,11 +5066,10 @@ struct walk_control {
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc)
+ struct walk_control *wc, int lookup_info)
{
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
- struct btrfs_key key;
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
@@ -4816,8 +5081,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, root,
eb->start, eb->len,
@@ -4827,21 +5093,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(wc->refs[level] == 0);
}
- if (wc->stage == DROP_REFERENCE &&
- wc->update_ref && wc->refs[level] > 1) {
- BUG_ON(eb == root->node);
- BUG_ON(path->slots[level] > 0);
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
- else
- btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
- if (btrfs_header_owner(eb) == root->root_key.objectid &&
- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level;
- }
- }
-
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level] > 1)
return 1;
@@ -4878,6 +5129,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
+ * hepler to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int *lookup_info)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 parent;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ /*
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
+ return 1;
+ }
+
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
+ if (!next) {
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level - 1] == 0);
+ *lookup_info = 0;
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level - 1] > 1) {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
+ }
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+ }
+
+ if (!btrfs_buffer_uptodate(next, generation)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ *lookup_info = 1;
+ }
+
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(root, bytenr, blocksize, generation);
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ }
+
+ level--;
+ BUG_ON(level != btrfs_header_level(next));
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
+ return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+ }
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ *lookup_info = 1;
+ return 1;
+}
+
+/*
* hepler to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
@@ -4904,7 +5285,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (level < wc->shared_level)
goto out;
- BUG_ON(wc->refs[level] <= 1);
ret = find_next_key(path, level + 1, &wc->update_progress);
if (ret > 0)
wc->update_ref = 0;
@@ -4935,8 +5315,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return 1;
}
- } else {
- BUG_ON(level != 0);
}
}
@@ -4989,39 +5367,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
- struct extent_buffer *next;
- struct extent_buffer *cur;
- u64 bytenr;
- u64 ptr_gen;
- u32 blocksize;
int level = wc->level;
+ int lookup_info = 1;
int ret;
while (level >= 0) {
- cur = path->nodes[level];
- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
- ret = walk_down_proc(trans, root, path, wc);
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
if (ret > 0)
break;
if (level == 0)
break;
- bytenr = btrfs_node_blockptr(cur, path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
-
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
-
- level--;
- BUG_ON(level != btrfs_header_level(next));
- path->nodes[level] = next;
- path->slots[level] = 0;
- path->locks[level] = 1;
- wc->level = level;
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ if (ret > 0) {
+ path->slots[level]++;
+ continue;
+ }
+ level = wc->level;
}
return 0;
}
@@ -5111,9 +5478,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
err = ret;
goto out;
}
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+ WARN_ON(ret > 0);
/*
* unlock our path, this is safe because only this
@@ -5148,6 +5513,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
ret = walk_down_tree(trans, root, path, wc);
@@ -5200,9 +5566,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
ret = btrfs_del_root(trans, tree_root, &root->root_key);
BUG_ON(ret);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root);
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+ NULL, NULL);
+ BUG_ON(ret < 0);
+ if (ret > 0) {
+ ret = btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ }
+
+ if (root->in_radix) {
+ btrfs_free_fs_root(tree_root->fs_info, root);
+ } else {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root);
+ }
out:
btrfs_end_transaction(trans, tree_root);
kfree(wc);
@@ -5254,6 +5635,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
wret = walk_down_tree(trans, root, path, wc);
@@ -5396,9 +5778,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
while (1) {
int ret;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -6841,287 +7223,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
return 0;
}
-#if 0
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
-{
- struct btrfs_path *path;
- struct btrfs_inode_item *item;
- struct extent_buffer *leaf;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
- btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
- struct btrfs_key root_key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
- int err = 0;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ int full = 0;
+ int ret = 0;
- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ /* odd, couldn't find the block group, leave it alone */
+ if (!block_group)
+ return -1;
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
- if (err)
+ /* no bytes used, we're good */
+ if (!btrfs_block_group_used(&block_group->item))
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
- BUG_ON(err);
-
- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
- if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- BTRFS_I(inode)->location.objectid = objectid;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
- BUG_ON(is_bad_inode(inode));
- } else {
- BUG_ON(1);
- }
- BTRFS_I(inode)->index_cnt = group->key.objectid;
-
- err = btrfs_orphan_add(trans, inode);
-out:
- btrfs_end_transaction(trans, root);
- if (err) {
- if (inode)
- iput(inode);
- inode = ERR_PTR(err);
- }
- return inode;
-}
-
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
-{
-
- struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct list_head list;
- size_t offset;
- int ret;
- u64 disk_bytenr;
-
- INIT_LIST_HEAD(&list);
-
- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
-
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list);
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del_init(&sums->list);
-
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ full = space_info->full;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ /*
+ * if this is the last block group we have in this space, we can't
+ * relocate it unless we're able to allocate a new chunk below.
+ *
+ * Otherwise, we need to make sure we have room in the space to handle
+ * all of the extents from this block group. If we can, we're good
+ */
+ if ((space_info->total_bytes != block_group->key.offset) &&
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ btrfs_block_group_used(&block_group->item) <
+ space_info->total_bytes)) {
+ spin_unlock(&space_info->lock);
+ goto out;
}
- btrfs_put_ordered_extent(ordered);
- return 0;
-}
-
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct btrfs_fs_info *info = root->fs_info;
- struct extent_buffer *leaf;
- struct inode *reloc_inode;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_key key;
- u64 skipped;
- u64 cur_byte;
- u64 total_found;
- u32 nritems;
- int ret;
- int progress;
- int pass = 0;
-
- root = root->fs_info->extent_root;
-
- block_group = btrfs_lookup_block_group(info, group_start);
- BUG_ON(!block_group);
-
- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
- (unsigned long long)block_group->key.objectid,
- (unsigned long long)block_group->flags);
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- reloc_inode = create_reloc_inode(info, block_group);
- BUG_ON(IS_ERR(reloc_inode));
-
- __alloc_chunk_for_shrink(root, block_group, 1);
- set_block_group_readonly(block_group);
-
- btrfs_start_delalloc_inodes(info->tree_root);
- btrfs_wait_ordered_extents(info->tree_root, 0);
-again:
- skipped = 0;
- total_found = 0;
- progress = 0;
- key.objectid = block_group->key.objectid;
- key.offset = 0;
- key.type = 0;
- cur_byte = key.objectid;
-
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ spin_unlock(&space_info->lock);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(info->tree_root);
- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ /*
+ * ok we don't have enough space, but maybe we have free space on our
+ * devices to allocate new chunks for relocation, so loop through our
+ * alloc devices and guess if we have enough space. However, if we
+ * were marked as full, then we know there aren't enough chunks, and we
+ * can just return.
+ */
+ ret = -1;
+ if (full)
+ goto out;
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ mutex_lock(&root->fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 min_free = btrfs_block_group_used(&block_group->item);
+ u64 dev_offset, max_avail;
- while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-next:
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
+ /*
+ * check to make sure we can actually find a chunk with enough
+ * space to fit our block group in.
+ */
+ if (device->total_bytes > device->bytes_used + min_free) {
+ ret = find_free_dev_extent(NULL, device, min_free,
+ &dev_offset, &max_avail);
+ if (!ret)
break;
- }
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
+ ret = -1;
}
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- if (progress && need_resched()) {
- btrfs_release_path(root, path);
- cond_resched();
- progress = 0;
- continue;
- }
- progress = 1;
-
- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
- key.objectid + key.offset <= cur_byte) {
- path->slots[0]++;
- goto next;
- }
-
- total_found++;
- cur_byte = key.objectid + key.offset;
- btrfs_release_path(root, path);
-
- __alloc_chunk_for_shrink(root, block_group, 0);
- ret = relocate_one_extent(root, path, &key, block_group,
- reloc_inode, pass);
- BUG_ON(ret < 0);
- if (ret > 0)
- skipped++;
-
- key.objectid = cur_byte;
- key.type = 0;
- key.offset = 0;
- }
-
- btrfs_release_path(root, path);
-
- if (pass == 0) {
- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
}
-
- if (total_found > 0) {
- printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
- (unsigned long long)total_found, pass);
- pass++;
- if (total_found == skipped && pass > 2) {
- iput(reloc_inode);
- reloc_inode = create_reloc_inode(info, block_group);
- pass = 0;
- }
- goto again;
- }
-
- /* delete reloc_inode */
- iput(reloc_inode);
-
- /* unpin extents in this range */
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
-
- spin_lock(&block_group->lock);
- WARN_ON(block_group->pinned > 0);
- WARN_ON(block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
- spin_unlock(&block_group->lock);
- btrfs_put_block_group(block_group);
- ret = 0;
+ mutex_unlock(&root->fs_info->chunk_mutex);
out:
- btrfs_free_path(path);
+ btrfs_put_block_group(block_group);
return ret;
}
-#endif
static int find_first_block_group(struct btrfs_root *root,
struct btrfs_path *path, struct btrfs_key *key)
@@ -7164,8 +7345,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
+ down_write(&info->extent_commit_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ }
+ up_write(&info->extent_commit_sem);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7179,8 +7370,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
@@ -7250,7 +7440,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
cache->fs_info = info;
- init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -7272,8 +7461,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
- remove_sb_from_cache(root, cache);
-
/*
* check for two cases, either we are full, and therefore
* don't need to bother with the caching work since we won't
@@ -7282,13 +7469,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* time, particularly in the full case.
*/
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ free_excluded_extents(root, cache);
} else if (btrfs_block_group_used(&cache->item) == 0) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, root->fs_info,
found_key.objectid,
found_key.objectid +
found_key.offset);
+ free_excluded_extents(root, cache);
}
ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7296,6 +7489,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
&space_info);
BUG_ON(ret);
cache->space_info = space_info;
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&space_info->groups_sem);
list_add_tail(&cache->list, &space_info->block_groups);
up_write(&space_info->groups_sem);
@@ -7345,7 +7542,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -7354,15 +7550,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- remove_sb_from_cache(root, cache);
+ exclude_super_stripes(root, cache);
add_new_free_space(cache, root->fs_info, chunk_offset,
chunk_offset + size);
+ free_excluded_extents(root, cache);
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
+
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&cache->space_info->groups_sem);
list_add_tail(&cache->list, &cache->space_info->block_groups);
up_write(&cache->space_info->groups_sem);
@@ -7428,8 +7632,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..96577e8bf9fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
return NULL;
}
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
+ state = NULL;
}
}
+
return 0;
}
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
- tree->ops->set_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
+ return tree->ops->set_bit_hook(tree->mapping->host,
+ state->start, state->end,
+ state->state, bits);
}
+
+ return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
- if (tree->ops && tree->ops->clear_bit_hook) {
- tree->ops->clear_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
- }
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
int bits)
{
struct rb_node *node;
+ int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
(unsigned long long)start);
WARN_ON(1);
}
+ state->start = start;
+ state->end = end;
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- set_state_cb(tree, state, bits);
state->state |= bits;
- state->start = start;
- state->end = end;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ return tree->ops->split_extent_hook(tree->mapping->host,
+ orig, split);
+ return 0;
+}
+
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state, int bits, int wake,
int delete)
{
- int ret = state->state & bits;
+ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int ret = state->state & bits_to_clear;
if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
- state->state &= ~bits;
+ state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
if (delete || state->state == 0) {
@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
* bits were already set, or zero if none of the bits were already set.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask)
+ int bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
{
struct extent_state *state;
+ struct extent_state *cached;
struct extent_state *prealloc = NULL;
+ struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
@@ -488,6 +522,17 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+ *cached_state = NULL;
+ cached_state = NULL;
+ if (cached && cached->tree && cached->start == start) {
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ free_extent_state(cached);
+ }
/*
* this search will find the extents that end after
* our range starts
@@ -496,6 +541,7 @@ again:
if (!node)
goto out;
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
@@ -526,13 +572,11 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits,
- wake, delete);
+ set |= clear_state_bit(tree, state, bits, wake,
+ delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -547,19 +591,30 @@ again:
prealloc = alloc_extent_state(GFP_ATOMIC);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
-
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits,
- wake, delete);
+
+ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
prealloc = NULL;
goto out;
}
+ if (state->end < end && prealloc && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
set |= clear_state_bit(tree, state, bits, wake, delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start <= end && next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
goto search_again;
out:
@@ -641,40 +696,59 @@ out:
return 0;
}
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int bits)
{
+ int ret;
+
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- set_state_cb(tree, state, bits);
state->state |= bits;
+
+ return 0;
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ *cached_ptr = state;
+ atomic_inc(&state->refs);
+ }
+ }
}
/*
- * set some bits on a range in the tree. This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
*
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set. The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
*
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive This takes the tree lock.
*/
+
static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive, u64 *failed_start,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
int err = 0;
- int set;
u64 last_start;
u64 last_end;
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -683,6 +757,13 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start == start && state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
/*
* this search will find all the extents that end after
* our range starts.
@@ -694,8 +775,8 @@ again:
BUG_ON(err == -EEXIST);
goto out;
}
-
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
last_start = state->start;
last_end = state->end;
@@ -706,17 +787,32 @@ again:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set = state->state & bits;
- if (set && exclusive) {
+ struct rb_node *next_node;
+ if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
goto out;
}
- set_state_bits(tree, state, bits);
+
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
+
start = last_end + 1;
+ if (start < end && prealloc && !need_resched()) {
+ next_node = rb_next(node);
+ if (next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ }
goto search_again;
}
@@ -737,8 +833,7 @@ again:
* desired bit on it.
*/
if (state->start < start) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -749,13 +844,14 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -774,10 +870,13 @@ again:
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
- prealloc = NULL;
BUG_ON(err == -EEXIST);
- if (err)
+ if (err) {
+ prealloc = NULL;
goto out;
+ }
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -788,8 +887,7 @@ again:
* on the first half
*/
if (state->start <= end && state->end > end) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -797,7 +895,12 @@ again:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, bits);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
+ cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
- mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+ NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, 0, NULL,
- mask);
+ NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY,
- 0, NULL, mask);
+ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ 0, NULL, NULL, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+ NULL, mask);
}
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
- 0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ NULL, mask);
}
int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached_state, gfp_t mask)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, mask);
if (err == -EEXIST && (mask & __GFP_WAIT)) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
return err;
}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+ return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, mask);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, mask);
+ EXTENT_LOCKED, 1, 0, NULL, mask);
return 0;
}
return 1;
}
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ mask);
}
/*
@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_dirty(tree, start, end, GFP_NOFS);
return 0;
}
@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_writeback(tree, start, end, GFP_NOFS);
return 0;
}
@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 delalloc_start;
u64 delalloc_end;
u64 found;
+ struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1269,6 +1365,7 @@ again:
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
+ free_extent_state(cached_state);
if (!loops) {
unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1379,21 @@ again:
BUG_ON(ret);
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end,
+ 0, &cached_state, GFP_NOFS);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1);
+ EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ unlock_extent_cached(tree, delalloc_start, delalloc_end,
+ &cached_state, GFP_NOFS);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
+ free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -1303,11 +1403,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_pages,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback)
+ unsigned long op)
{
int ret;
struct page *pages[16];
@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int i;
int clear_bits = 0;
- if (clear_unlock)
+ if (op & EXTENT_CLEAR_UNLOCK)
clear_bits |= EXTENT_LOCKED;
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_bits |= EXTENT_DIRTY;
- if (clear_delalloc)
+ if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+ if (op & EXTENT_CLEAR_ACCOUNTING)
+ clear_bits |= EXTENT_DO_ACCOUNTING;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+ EXTENT_SET_PRIVATE2)))
return 0;
while (nr_pages > 0) {
@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
+
+ if (op & EXTENT_SET_PRIVATE2)
+ SetPagePrivate2(pages[i]);
+
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
}
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_page_dirty_for_io(pages[i]);
- if (set_writeback)
+ if (op & EXTENT_SET_WRITEBACK)
set_page_writeback(pages[i]);
- if (end_writeback)
+ if (op & EXTENT_END_WRITEBACK)
end_page_writeback(pages[i]);
- if (unlock_pages)
+ if (op & EXTENT_CLEAR_UNLOCK_PAGE)
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
@@ -1476,14 +1581,17 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled)
+ int bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
- node = tree_search(tree, start);
+ if (cached && cached->tree && cached->start == start)
+ node = &cached->rb_node;
+ else
+ node = tree_search(tree, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
bitset = 0;
break;
}
+
+ if (state->end == (u64)-1)
+ break;
+
start = state->end + 1;
if (start > end)
break;
@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
return 0;
}
@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
return 0;
}
@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree,
static int check_page_writeback(struct extent_io_tree *tree,
struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
- end_page_writeback(page);
+ end_page_writeback(page);
return 0;
}
@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
}
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ clear_extent_uptodate(tree, start, end, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
- clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
if (whole_page)
end_page_writeback(page);
else
@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+ if (test_range_bit(tree, cur, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 iosize;
u64 unlock_start;
sector_t sector;
+ struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
int ret;
@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
+ u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
* to this page.
@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
tree->ops->fill_delalloc(inode, page, delalloc_start,
delalloc_end, &page_started,
&nr_written);
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
delalloc_start = delalloc_end + 1;
}
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
/* did the fill delalloc function already unlock and start
* the IO?
@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done_unlocked;
}
}
- lock_extent(tree, start, page_end, GFP_NOFS);
-
- unlock_start = start;
-
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
if (ret == -EAGAIN) {
- unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
if (last_byte <= start) {
- clear_extent_dirty(tree, start, page_end, GFP_NOFS);
- unlock_extent(tree, start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- set_extent_uptodate(tree, start, page_end, GFP_NOFS);
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
if (cur >= last_byte) {
- clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- clear_extent_dirty(tree, cur,
- cur + iosize - 1, GFP_NOFS);
-
- unlock_extent(tree, unlock_start, cur + iosize - 1,
- GFP_NOFS);
-
/*
* end_io notification does not happen here for
* compressed extents
@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
/* leave this out until we have a page_mkwrite call */
if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0)) {
+ EXTENT_DIRTY, 0, NULL)) {
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
cur + iosize - 1);
@@ -2309,12 +2415,12 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (unlock_start <= page_end)
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page);
done_unlocked:
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
return 0;
}
@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
+ int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
scanned = 1;
}
retry:
- while (!done && (index <= end) &&
+ while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY, min(end - index,
(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2518,15 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret || wbc->nr_to_write <= 0)
- done = 1;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
+ if (ret)
done = 1;
- }
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent(tree, start, end, GFP_NOFS);
- wait_on_extent_writeback(tree, start, end);
+ wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
- 1, 1, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
+ 1, 1, NULL, GFP_NOFS);
return 0;
}
@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
!isnew && !PageUptodate(page) &&
(block_off_end > to || block_off_start < from) &&
!test_range_bit(tree, block_start, cur_end,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
u64 sector;
u64 extent_offset = block_start - em->start;
size_t iosize;
@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
*/
set_extent_bit(tree, block_start,
block_start + iosize - 1,
- EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map,
int ret = 1;
if (test_range_bit(tree, start, end,
- EXTENT_IOBITS | EXTENT_ORDERED, 0))
+ EXTENT_IOBITS, 0, NULL))
ret = 0;
else {
if ((mask & GFP_NOFS) == GFP_NOFS)
mask = GFP_NOFS;
- clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- 1, 1, mask);
+ /*
+ * at this point we can safely clear everything except the
+ * locked bit and the nodatasum bit
+ */
+ clear_extent_bit(tree, start, end,
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+ 0, 0, NULL, mask);
}
return ret;
}
@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 len;
while (start <= end) {
len = end - start + 1;
- spin_lock(&map->lock);
+ write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
if (!em || IS_ERR(em)) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
break;
}
if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
em->start != start) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
free_extent_map(em);
break;
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
- EXTENT_LOCKED | EXTENT_WRITEBACK |
- EXTENT_ORDERED,
- 0)) {
+ EXTENT_LOCKED | EXTENT_WRITEBACK,
+ 0, NULL)) {
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
start = extent_map_end(em);
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
if (ret)
return 1;
while (start <= end) {
@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1);
+ EXTENT_UPTODATE, 1, NULL);
if (ret)
return ret;
@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return 0;
if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
return 0;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..36de250a7b2b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,9 @@
#define EXTENT_DEFRAG (1 << 6)
#define EXTENT_DEFRAG_DONE (1 << 7)
#define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
@@ -27,6 +26,16 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
+/* these are flags for extent_clear_unlock_delalloc */
+#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
+#define EXTENT_CLEAR_UNLOCK 0x2
+#define EXTENT_CLEAR_DELALLOC 0x4
+#define EXTENT_CLEAR_DIRTY 0x8
+#define EXTENT_SET_WRITEBACK 0x10
+#define EXTENT_END_WRITEBACK 0x20
+#define EXTENT_SET_PRIVATE2 0x40
+#define EXTENT_CLEAR_ACCOUNTING 0x80
+
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
@@ -62,8 +71,13 @@ struct extent_io_ops {
struct extent_state *state, int uptodate);
int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long bits);
+ int (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ int (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
};
@@ -81,10 +95,14 @@ struct extent_state {
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
+ u64 split_start;
+ u64 split_end;
/* for use by the FS */
u64 private;
@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached, gfp_t mask);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 max_bytes, unsigned long bits);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled);
+ int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask);
+ int bits, int wake, int delete, struct extent_state **cached,
+ gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_page,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback);
+ unsigned long op);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..ccbdcb54ec5d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
{
tree->map.rb_node = NULL;
- spin_lock_init(&tree->lock);
+ rwlock_init(&tree->lock);
}
/**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *em;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
ret = -EEXIST;
goto out;
}
- assert_spin_locked(&tree->lock);
rb = tree_insert(&tree->map, em->start, &em->rb_node);
if (rb) {
ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- assert_spin_locked(&tree->lock);
rb_node = __tree_search(&tree->map, start, &prev, &next);
if (!rb_node && prev) {
em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
}
/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+
+/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
* @em: extent map beeing removed
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
int ret = 0;
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- assert_spin_locked(&tree->lock);
rb_erase(&em->rb_node, &tree->map);
em->in_tree = 0;
return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..ab6d74b6e647 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
struct extent_map_tree {
struct rb_root map;
- spinlock_t lock;
+ rwlock_t lock;
};
static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273a..77f759302e12 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
int err = 0;
int i;
struct inode *inode = fdentry(file)->d_inode;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- u64 hint_byte;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ if (err)
+ return err;
- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
- trans = btrfs_join_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
- hint_byte = 0;
-
- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
- /* check for reserved extents on each page, we don't want
- * to reset the delalloc bit on things that already have
- * extents reserved.
- */
- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
- err = btrfs_end_transaction(trans, root);
-out_unlock:
- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
return err;
}
@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
if (!split2)
split2 = alloc_extent_map(GFP_NOFS);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
break;
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- spin_unlock(&em_tree->lock);
if (em->start <= start &&
(!testend || em->start + em->len >= start + len)) {
free_extent_map(em);
+ write_unlock(&em_tree->lock);
break;
}
if (start < em->start) {
@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
start = em->start + em->len;
}
free_extent_map(em);
+ write_unlock(&em_tree->lock);
continue;
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
split = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_byte)
+ u64 inline_limit, u64 *hint_byte, int drop_cache)
{
u64 extent_end = 0;
u64 search_start = start;
@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int ret;
inline_limit = 0;
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
@@ -894,7 +878,8 @@ again:
btrfs_put_ordered_extent(ordered);
clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, GFP_NOFS);
@@ -924,7 +909,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
unsigned long last_index;
int will_write;
- will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+ will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* do the reserve before the mutex lock in case we have to do some
+ * flushing. We wouldn't deadlock, but this is more polite.
+ */
+ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (err)
+ goto out_nolock;
+
+ mutex_lock(&inode->i_mutex);
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
- goto out_nolock;
+ goto out;
+
if (count == 0)
- goto out_nolock;
+ goto out;
err = file_remove_suid(file);
if (err)
- goto out_nolock;
+ goto out;
+
file_update_time(file);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- mutex_lock(&inode->i_mutex);
+ /* generic_write_checks can change our pos */
+ start_pos = pos;
+
BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1024,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
if (will_write) {
- btrfs_fdatawrite_range(inode->i_mapping, pos,
- pos + write_bytes - 1,
- WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1);
} else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages);
@@ -1047,6 +1045,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
out_nolock:
kfree(pages);
@@ -1077,7 +1076,7 @@ out_nolock:
if (err)
num_written = err;
- if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
trans = btrfs_start_transaction(root, 1);
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
@@ -1087,8 +1086,10 @@ out_nolock:
btrfs_end_transaction(trans, root);
else
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
}
}
if (file->f_flags & O_DIRECT) {
@@ -1138,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
int ret = 0;
struct btrfs_trans_handle *trans;
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
/*
* check the transaction that last modified this inode
* and see if its already been committed
@@ -1145,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (!BTRFS_I(inode)->last_trans)
goto out;
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
mutex_lock(&root->fs_info->trans_mutex);
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
@@ -1154,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
}
mutex_unlock(&root->fs_info->trans_mutex);
- root->log_batch++;
- filemap_fdatawrite(inode->i_mapping);
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- root->log_batch++;
-
- if (datasync && !(inode->i_state & I_DIRTY_PAGES))
- goto out;
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
@@ -1189,21 +1195,25 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
*/
mutex_unlock(&dentry->d_inode->i_mutex);
- if (ret > 0) {
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- ret = btrfs_end_transaction(trans, root);
- else
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
return ret > 0 ? EIO : ret;
}
-static struct vm_operations_struct btrfs_file_vm_ops = {
+static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
};
@@ -1215,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5edcee3a617f..cb2849f03251 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
- u64 max_bytes, possible_bytes;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
/*
* The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
max_bytes = MAX_CACHE_BYTES_PER_GIG *
(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
- (sizeof(struct btrfs_free_space) *
- block_group->extents_thresh);
+ /*
+ * we want to account for 1 more bitmap than what we have so we can make
+ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+ * we add more bitmaps.
+ */
+ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
- if (possible_bytes > max_bytes) {
- int extent_bytes = max_bytes -
- (block_group->total_bitmaps * PAGE_CACHE_SIZE);
+ if (bitmap_bytes >= max_bytes) {
+ block_group->extents_thresh = 0;
+ return;
+ }
- if (extent_bytes <= 0) {
- block_group->extents_thresh = 0;
- return;
- }
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the maxw
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
- block_group->extents_thresh = extent_bytes /
- (sizeof(struct btrfs_free_space));
- }
+ block_group->extents_thresh =
+ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
BUG_ON(block_group->total_bitmaps >= max_bitmaps);
info->offset = offset_to_bitmap(block_group, offset);
+ info->bytes = 0;
link_free_space(block_group, info);
block_group->total_bitmaps++;
@@ -1288,7 +1296,7 @@ again:
window_start = entry->offset;
window_free = entry->bytes;
last = entry;
- max_extent = 0;
+ max_extent = entry->bytes;
} else {
last = next;
window_free += next->bytes;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c611808..72ce3c173d6a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
+ if (ret == -EOVERFLOW)
+ ret = -EMLINK;
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_inode_item));
- if (ret == 0 && objectid > root->highest_inode)
- root->highest_inode = objectid;
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123d..c56eb5909172 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = found_key.objectid;
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID;
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
@@ -53,91 +54,27 @@ error:
return ret;
}
-/*
- * walks the btree of allocated inodes and find a hole.
- */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 *objectid)
{
- struct btrfs_path *path;
- struct btrfs_key key;
int ret;
- int slot = 0;
- u64 last_ino = 0;
- int start_found;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- u64 search_start = dirid;
-
mutex_lock(&root->objectid_mutex);
- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
- *objectid = ++root->last_inode_alloc;
- mutex_unlock(&root->objectid_mutex);
- return 0;
- }
- path = btrfs_alloc_path();
- BUG_ON(!path);
- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
- search_key.objectid = search_start;
- search_key.type = 0;
- search_key.offset = 0;
-
- start_found = 0;
- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- if (!start_found) {
- *objectid = search_start;
- start_found = 1;
- goto found;
- }
- *objectid = last_ino > search_start ?
- last_ino : search_start;
- goto found;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
- if (key.objectid >= search_start) {
- if (start_found) {
- if (last_ino < search_start)
- last_ino = search_start;
- if (key.objectid > last_ino) {
- *objectid = last_ino;
- goto found;
- }
- } else if (key.objectid > search_start) {
- *objectid = search_start;
- goto found;
- }
- }
- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
- break;
+ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ if (ret)
+ goto out;
+ }
- start_found = 1;
- last_ino = key.objectid + 1;
- path->slots[0]++;
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ ret = -ENOSPC;
+ goto out;
}
- BUG_ON(1);
-found:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
- BUG_ON(*objectid < search_start);
- mutex_unlock(&root->objectid_mutex);
- return 0;
-error:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
mutex_unlock(&root->objectid_mutex);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59cba180fe83..b3ad168a0bfc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,14 +55,14 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
-static struct inode_operations btrfs_dir_inode_operations;
-static struct inode_operations btrfs_symlink_inode_operations;
-static struct inode_operations btrfs_dir_ro_inode_operations;
-static struct inode_operations btrfs_special_inode_operations;
-static struct inode_operations btrfs_file_inode_operations;
-static struct address_space_operations btrfs_aops;
-static struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_dir_file_operations;
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
+static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
}
ret = btrfs_drop_extents(trans, root, inode, start,
- aligned_end, aligned_end, start, &hint_byte);
+ aligned_end, aligned_end, start,
+ &hint_byte, 1);
BUG_ON(ret);
if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
- btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -423,9 +424,12 @@ again:
* and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 0,
- 0, 1, 1, 1);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
ret = 0;
goto free_pages_out;
}
@@ -534,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree;
- int ret;
+ int ret = 0;
if (list_empty(&async_cow->extents))
return 0;
@@ -548,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
io_tree = &BTRFS_I(inode)->io_tree;
+retry:
/* did the compression code fall back to uncompressed IO? */
if (!async_extent->pages) {
int page_started = 0;
@@ -558,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->ram_size - 1, GFP_NOFS);
/* allocate blocks */
- cow_file_range(inode, async_cow->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ ret = cow_file_range(inode, async_cow->locked_page,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0);
/*
* if page_started, cow_file_range inserted an
@@ -570,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
* and IO for us. Otherwise, we need to submit
* all those pages down to the drive.
*/
- if (!page_started)
+ if (!page_started && !ret)
extent_write_locked_range(io_tree,
inode, async_extent->start,
async_extent->start +
@@ -598,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->compressed_size,
0, alloc_hint,
(u64)-1, &ins, 1);
- BUG_ON(ret);
+ if (ret) {
+ int i;
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
+ goto retry;
+ }
+
em = alloc_extent_map(GFP_NOFS);
em->start = async_extent->start;
em->len = async_extent->ram_size;
@@ -611,9 +630,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -636,11 +655,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
* clear dirty, set writeback and unlock the pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, 1, 1, 0, 1, 1, 0);
+ &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
@@ -711,9 +733,15 @@ static noinline int cow_file_range(struct inode *inode,
start, end, 0, NULL);
if (ret == 0) {
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 1,
- 1, 1, 1, 1);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
@@ -725,9 +753,34 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+ read_lock(&BTRFS_I(inode)->extent_tree.lock);
+ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+ start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&BTRFS_I(inode)->extent_tree.lock);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
+ unsigned long op;
+
cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
@@ -737,7 +790,6 @@ static noinline int cow_file_range(struct inode *inode,
em = alloc_extent_map(GFP_NOFS);
em->start = start;
em->orig_start = em->start;
-
ram_size = ins.offset;
em->len = ins.offset;
@@ -747,9 +799,9 @@ static noinline int cow_file_range(struct inode *inode,
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -776,11 +828,17 @@ static noinline int cow_file_range(struct inode *inode,
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
+ *
+ * Do set the Private2 bit so we know this page was properly
+ * setup for writepage
*/
+ op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2;
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
start, start + ram_size - 1,
- locked_page, unlock, 1,
- 1, 0, 0, 0);
+ locked_page, op);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
@@ -852,8 +910,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
- EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+ 1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
async_cow->inode = inode;
@@ -994,6 +1052,7 @@ next_slot:
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
+ extent_type = 0;
goto out_check;
}
@@ -1080,9 +1139,9 @@ out_check:
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -1100,8 +1159,10 @@ out_check:
BUG_ON(ret);
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
- cur_offset, cur_offset + num_bytes - 1,
- locked_page, 1, 1, 1, 0, 0, 0);
+ cur_offset, cur_offset + num_bytes - 1,
+ locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1147,6 +1208,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
+static int btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 size;
+
+ if (!(orig->state & EXTENT_DELALLOC))
+ return 0;
+
+ size = orig->end - orig->start + 1;
+ if (size > root->fs_info->max_extent) {
+ u64 num_extents;
+ u64 new_size;
+
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+
+ /*
+ * if we break a large extent up then leave oustanding_extents
+ * be, since we've already accounted for the large extent.
+ */
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) < num_extents)
+ return 0;
+ }
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return 0;
+
+ old_size = other->end - other->start + 1;
+ if (new->start < other->start)
+ new_size = other->end - new->start + 1;
+ else
+ new_size = new->end - other->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= root->fs_info->max_extent) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ return 0;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) > num_extents)
+ return 0;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
@@ -1155,6 +1299,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
{
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
@@ -1162,6 +1307,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
*/
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1178,22 +1327,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ if (bits & EXTENT_DO_ACCOUNTING) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ }
+
spin_lock(&root->fs_info->delalloc_lock);
- if (end - start + 1 > root->fs_info->delalloc_bytes) {
+ if (state->end - state->start + 1 >
+ root->fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs warning: delalloc account "
"%llu %llu\n",
- (unsigned long long)end - start + 1,
+ (unsigned long long)
+ state->end - state->start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1201,9 +1359,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
btrfs_delalloc_free_space(root, inode,
- end - start + 1);
- root->fs_info->delalloc_bytes -= end - start + 1;
- BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+ state->end -
+ state->start + 1);
+ root->fs_info->delalloc_bytes -= state->end -
+ state->start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= state->end -
+ state->start + 1;
}
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -1374,10 +1535,8 @@ again:
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
/* already ordered? We're done */
- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_ORDERED, 0)) {
+ if (PagePrivate2(page))
goto out;
- }
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
@@ -1413,11 +1572,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
struct inode *inode = page->mapping->host;
struct btrfs_writepage_fixup *fixup;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_ORDERED, 0);
- if (ret)
+ /* this page is properly in the ordered list */
+ if (TestClearPagePrivate2(page))
return 0;
if (PageChecked(page))
@@ -1455,9 +1612,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(!path);
path->leave_spinning = 1;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, locked_end,
- file_pos, &hint);
+ file_pos, &hint, 0);
BUG_ON(ret);
ins.objectid = inode->i_ino;
@@ -1485,7 +1652,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
@@ -1596,6 +1762,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset,
+ ordered_extent->len);
BUG_ON(ret);
}
unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1792,7 @@ nocow:
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ ClearPagePrivate2(page);
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}
@@ -1669,13 +1839,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->last_mirror = 0;
failrec->bio_flags = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (em->start > start || em->start + em->len < start) {
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || IS_ERR(em)) {
kfree(failrec);
@@ -1794,7 +1964,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
GFP_NOFS);
return 0;
@@ -2333,7 +2503,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
root = BTRFS_I(dir)->root;
+ /*
+ * 5 items for unlink inode
+ * 1 for orphan
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 6);
+ return PTR_ERR(trans);
+ }
btrfs_set_trans_block_group(trans, dir);
@@ -2348,10 +2530,74 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir->i_ino, &index, name, name_len);
+ if (ret < 0) {
+ BUG_ON(ret != -ENOENT);
+ di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ name, name_len);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ index = key.offset;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+ dir->i_sb->s_dirt = 1;
+
+ btrfs_free_path(path);
+ return 0;
+}
+
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -2361,31 +2607,43 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
- /*
- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
- * the root of a subvolume or snapshot
- */
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- }
+
+ ret = btrfs_reserve_metadata_space(root, 5);
+ if (ret)
+ return ret;
trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 5);
+ return PTR_ERR(trans);
+ }
+
btrfs_set_trans_block_group(trans, dir);
+ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, root, dir,
+ BTRFS_I(inode)->location.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ goto out;
+ }
+
err = btrfs_orphan_add(trans, inode);
if (err)
- goto fail_trans;
+ goto out;
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 5);
btrfs_btree_balance_dirty(root, nr);
if (ret && !err)
@@ -2826,12 +3084,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret)
+ goto out;
ret = -ENOMEM;
again:
page = grab_cache_page(mapping, index);
- if (!page)
+ if (!page) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
goto out;
+ }
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -2864,7 +3132,16 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ goto out_unlock;
+ }
+
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
kaddr = kmap(page);
@@ -2877,6 +3154,9 @@ again:
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ if (ret)
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
unlock_page(page);
page_cache_release(page);
out:
@@ -2895,17 +3175,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err;
+ int err = 0;
if (size <= hole_start)
return 0;
- err = btrfs_check_metadata_free_space(root);
+ err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (err)
return err;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
-
while (1) {
struct btrfs_ordered_extent *ordered;
btrfs_wait_ordered_range(inode, hole_start,
@@ -2935,15 +3213,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
cur_offset,
cur_offset + hole_size,
block_end,
- cur_offset, &hint_byte);
+ cur_offset, &hint_byte, 1);
if (err)
break;
+
+ err = btrfs_reserve_metadata_space(root, 1);
+ if (err)
+ break;
+
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
+ btrfs_unreserve_metadata_space(root, 1);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3003,6 +3287,11 @@ void btrfs_delete_inode(struct inode *inode)
}
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ goto no_delete;
+ }
+
btrfs_i_size_write(inode, 0);
trans = btrfs_join_transaction(root, 1);
@@ -3070,29 +3359,67 @@ out_err:
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_root *root,
- struct btrfs_key *location,
- struct btrfs_root **sub_root,
- struct dentry *dentry)
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
{
- struct btrfs_root_item *ri;
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ int ret;
+ int err = 0;
- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
- return 0;
- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return 0;
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
- *sub_root = btrfs_read_fs_root(root->fs_info, location,
- dentry->d_name.name,
- dentry->d_name.len);
- if (IS_ERR(*sub_root))
- return PTR_ERR(*sub_root);
+ err = -ENOENT;
+ ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
- ri = &(*sub_root)->root_item;
- location->objectid = btrfs_root_dirid(ri);
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
- location->offset = 0;
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ goto out;
- return 0;
+ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+ (unsigned long)(ref + 1),
+ dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ return err;
}
static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3428,13 @@ static void inode_tree_add(struct inode *inode)
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
-
again:
p = &root->inode_tree.rb_node;
parent = NULL;
+ if (hlist_unhashed(&inode->i_hash))
+ return;
+
spin_lock(&root->inode_lock);
while (*p) {
parent = *p;
@@ -3132,13 +3461,87 @@ again:
static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int empty = 0;
spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ }
+ spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ synchronize_srcu(&root->fs_info->subvol_srcu);
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
+ }
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = entry->vfs_inode.i_ino + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will remove it from
+ * the inode cache when its usage count
+ * hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
}
spin_unlock(&root->inode_lock);
+ return 0;
}
static noinline void init_btrfs_i(struct inode *inode)
@@ -3148,6 +3551,7 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->generation = 0;
bi->sequence = 0;
bi->last_trans = 0;
+ bi->last_sub_trans = 0;
bi->logged_trans = 0;
bi->delalloc_bytes = 0;
bi->reserved_bytes = 0;
@@ -3225,15 +3629,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ init_btrfs_i(inode);
+
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->dummy_inode = 1;
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ return inode;
+}
+
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
- struct btrfs_inode *bi = BTRFS_I(dir);
- struct btrfs_root *root = bi->root;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
+ int index;
int ret;
+ dentry->d_op = &btrfs_dentry_operations;
+
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -3242,29 +3672,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
- inode = NULL;
- if (location.objectid) {
- ret = fixup_tree_root_location(root, &location, &sub_root,
- dentry);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
+ if (location.objectid == 0)
+ return NULL;
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, &location, root);
+ return inode;
+ }
+
+ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+ index = srcu_read_lock(&root->fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(root, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ } else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
+ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
return inode;
}
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+ struct btrfs_root *root;
+
+ if (!dentry->d_inode && !IS_ROOT(dentry))
+ dentry = dentry->d_parent;
+
+ if (dentry->d_inode) {
+ root = BTRFS_I(dentry->d_inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ }
+ return 0;
+}
+
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode;
- if (dentry->d_name.len > BTRFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3603,9 +4056,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail;
- if (objectid > root->highest_inode)
- root->highest_inode = objectid;
-
inode->i_uid = current_fsuid();
if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +4123,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
- int ret;
+ int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
- key.objectid = inode->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+ } else {
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ }
+
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_inode->i_ino,
+ index, name, name_len);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root,
+ name, name_len, inode->i_ino,
+ parent_inode->i_ino, index);
+ }
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode->i_ino,
- &key, btrfs_inode_type(inode),
- index);
if (ret == 0) {
- if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root,
- name, name_len,
- inode->i_ino,
- parent_inode->i_ino,
- index);
- }
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode->i_ino, &key,
+ btrfs_inode_type(inode), index);
+ BUG_ON(ret);
+
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3732,11 +4191,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3774,6 +4240,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3794,10 +4261,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
+
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3838,6 +4313,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3860,10 +4336,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink == 0)
return -ENOENT;
- btrfs_inc_nlink(inode);
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ err = btrfs_reserve_metadata_space(root, 3);
if (err)
- goto fail;
+ return err;
+
+ btrfs_inc_nlink(inode);
+
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
@@ -3875,20 +4357,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_add_nondir(trans, dentry, inode, 1, index);
- if (err)
- drop_inode = 1;
-
- btrfs_update_inode_block_group(trans, dir);
- err = btrfs_update_inode(trans, root, inode);
-
- if (err)
+ if (err) {
drop_inode = 1;
+ } else {
+ btrfs_update_inode_block_group(trans, dir);
+ err = btrfs_update_inode(trans, root, inode);
+ BUG_ON(err);
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ }
nr = trans->blocks_used;
-
- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3908,17 +4389,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_unlock;
+ return err;
trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, dir);
-
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (!trans) {
+ err = -ENOMEM;
goto out_unlock;
}
+ btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
@@ -3967,6 +4452,7 @@ out_fail:
btrfs_end_transaction_throttle(trans, root);
out_unlock:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4064,11 +4550,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int compressed;
again:
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
em->bdev = root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4701,11 @@ again:
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
+ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_CACHE_SIZE - pg_offset -
+ copy_size);
+ }
kunmap(page);
}
flush_dcache_page(page);
@@ -4259,7 +4750,7 @@ insert:
}
err = 0;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
@@ -4299,7 +4790,7 @@ insert:
err = 0;
}
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
out:
if (path)
btrfs_free_path(path);
@@ -4398,13 +4889,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ /*
+ * we have the page locked, so new writeback can't start,
+ * and the dirty bit won't be cleared while we are here.
+ *
+ * Wait for IO on this page so that we can safely clear
+ * the PagePrivate2 bit and do ordered accounting
+ */
wait_on_page_writeback(page);
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
-
lock_extent(tree, page_start, page_end, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
@@ -4415,16 +4914,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED, 1, 0, GFP_NOFS);
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+ NULL, GFP_NOFS);
+ /*
+ * whoever cleared the private bit is responsible
+ * for the finish_ordered_io
+ */
+ if (TestClearPagePrivate2(page)) {
+ btrfs_finish_ordered_io(page->mapping->host,
+ page_start, page_end);
+ }
btrfs_put_ordered_extent(ordered);
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_ORDERED,
- 1, 1, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -4473,6 +4978,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -4504,7 +5016,24 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ /*
+ * XXX - page_mkwrite gets called every time the page is dirtied, even
+ * if it was already dirty, so for space accounting reasons we need to
+ * clear any delalloc bits for the range we are fixing to save. There
+ * is probably a better way to do this, but for now keep consistent with
+ * prepare_pages in the normal write path.
+ */
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ goto out_unlock;
+ }
ret = 0;
/* page is wholly or partially inside EOF */
@@ -4521,11 +5050,17 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+ SetPageUptodate(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (!ret)
+ return VM_FAULT_LOCKED;
unlock_page(page);
out:
return ret;
@@ -4544,7 +5079,9 @@ static void btrfs_truncate(struct inode *inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (ret)
+ return;
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
@@ -4594,11 +5131,11 @@ out:
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint)
{
struct inode *inode;
- int error;
+ int err;
u64 index = 0;
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +5148,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_nlink = 1;
btrfs_i_size_write(inode, 0);
- error = btrfs_update_inode(trans, new_root, inode);
- if (error)
- return error;
+ err = btrfs_update_inode(trans, new_root, inode);
+ BUG_ON(err);
- d_instantiate(dentry, inode);
+ iput(inode);
return 0;
}
@@ -4640,7 +5176,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
ei->last_trans = 0;
+ ei->last_sub_trans = 0;
ei->logged_trans = 0;
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+ ei->root = NULL;
+ spin_lock_init(&ei->accounting_lock);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4656,6 +5197,14 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(inode->i_data.nrpages);
/*
+ * This can happen where we create an inode, but somebody else also
+ * created the same inode and we need to destroy the one we already
+ * created.
+ */
+ if (!root)
+ goto free;
+
+ /*
* Make sure we're properly removed from the ordered operation
* lists.
*/
@@ -4690,9 +5239,20 @@ void btrfs_destroy_inode(struct inode *inode)
}
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
+void btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+ generic_delete_inode(inode);
+ else
+ generic_drop_inode(inode);
+}
+
static void init_once(void *foo)
{
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5321,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
+ u64 root_objectid;
int ret;
- /* we're not allowed to rename between subvolumes */
- if (BTRFS_I(old_inode)->root->root_key.objectid !=
- BTRFS_I(new_dir)->root->root_key.objectid)
+ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
+ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
if (S_ISDIR(old_inode->i_mode) && new_inode &&
- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- }
- /* to rename a snapshot or subvolume, we need to juggle the
- * backrefs. This isn't coded yet
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
*/
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return -EXDEV;
-
- ret = btrfs_check_metadata_free_space(root);
+ ret = btrfs_reserve_metadata_space(root, 11);
if (ret)
- goto out_unlock;
+ return ret;
/*
* we're using rename to replace one file with another.
@@ -4796,8 +5365,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping);
+ /* close the racy window with snapshot create/destroy ioctl */
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+
trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_inode->i_ino,
+ new_dir->i_ino, index);
+ if (ret)
+ goto out_fail;
+ /*
+ * this is an ugly little race, but the rename is required
+ * to make sure that if we crash, the inode is either at the
+ * old name or the new one. pinning the log transaction lets
+ * us make sure we don't allow a log commit to come in after
+ * we unlink the name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+ }
/*
* make sure the inode gets flushed if it is replacing
* something.
@@ -4807,18 +5408,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_add_ordered_operation(trans, root, old_inode);
}
- /*
- * this is an ugly little race, but the rename is required to make
- * sure that if we crash, the inode is either at the old name
- * or the new one. pinning the log transaction lets us make sure
- * we don't allow a log commit to come in after we unlink the
- * name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
-
- btrfs_set_trans_block_group(trans, new_dir);
-
- btrfs_inc_nlink(old_dentry->d_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -4826,47 +5415,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else {
+ btrfs_inc_nlink(old_dentry->d_inode);
+ ret = btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_unlink_inode(trans, root, new_dir,
- new_dentry->d_inode,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(new_inode->i_ino ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ root_objectid = BTRFS_I(new_inode)->location.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
- if (ret)
- goto out_fail;
+ BUG_ON(ret);
}
-
}
- ret = btrfs_set_inode_index(new_dir, &index);
- if (ret)
- goto out_fail;
- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
- old_inode, new_dentry->d_name.name,
- new_dentry->d_name.len, 1, index);
- if (ret)
- goto out_fail;
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, index);
+ BUG_ON(ret);
- btrfs_log_new_name(trans, old_inode, old_dir,
- new_dentry->d_parent);
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
+ btrfs_end_log_trans(root);
+ }
out_fail:
-
- /* this btrfs_end_log_trans just allows the current
- * log-sub transaction to complete
- */
- btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ btrfs_unreserve_metadata_space(root, 11);
return ret;
}
@@ -4938,11 +5540,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto out_fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5023,6 +5632,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
out_fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5044,6 +5654,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ goto out;
+
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
@@ -5058,9 +5673,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset -1, 0);
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
+ btrfs_unreserve_metadata_space(root, 1);
}
out:
if (cur_offset > start) {
@@ -5201,7 +5819,7 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask, btrfs_check_acl);
}
-static struct inode_operations btrfs_dir_inode_operations = {
+static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
.create = btrfs_create,
@@ -5219,11 +5837,12 @@ static struct inode_operations btrfs_dir_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
};
-static struct inode_operations btrfs_dir_ro_inode_operations = {
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
-static struct file_operations btrfs_dir_file_operations = {
+
+static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = btrfs_real_readdir,
@@ -5245,6 +5864,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
};
/*
@@ -5259,7 +5880,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
*
* For now we're avoiding this by dropping bmap.
*/
-static struct address_space_operations btrfs_aops = {
+static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
@@ -5269,16 +5890,17 @@ static struct address_space_operations btrfs_aops = {
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
+ .error_remove_page = generic_error_remove_page,
};
-static struct address_space_operations btrfs_symlink_aops = {
+static const struct address_space_operations btrfs_symlink_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
};
-static struct inode_operations btrfs_file_inode_operations = {
+static const struct inode_operations btrfs_file_inode_operations = {
.truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
@@ -5290,7 +5912,7 @@ static struct inode_operations btrfs_file_inode_operations = {
.fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
-static struct inode_operations btrfs_special_inode_operations = {
+static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
@@ -5299,7 +5921,7 @@ static struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
-static struct inode_operations btrfs_symlink_inode_operations = {
+static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
@@ -5309,3 +5931,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
+
+const struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f7..cdbb054102b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- struct btrfs_root *new_root = root;
- struct inode *dir;
+ struct btrfs_root *new_root;
+ struct inode *dir = dentry->d_parent->d_inode;
int ret;
int err;
u64 objectid;
@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
unsigned long nr = 1;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
- goto fail_commit;
+ return ret;
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
if (ret)
goto fail;
+ key.offset = (u64)-1;
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(new_root));
+
+ btrfs_record_root_in_trans(trans, new_root);
+
+ ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+ BTRFS_I(dir)->block_group);
/*
* insert the directory item
*/
- key.offset = (u64)-1;
- dir = dentry->d_parent->d_inode;
ret = btrfs_set_inode_index(dir, &index);
BUG_ON(ret);
@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- objectid, BTRFS_ROOT_BACKREF_KEY,
- root->root_key.objectid,
+ objectid, root->root_key.objectid,
dir->i_ino, index, name, namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- root->root_key.objectid, BTRFS_ROOT_REF_KEY,
- objectid,
- dir->i_ino, index, name, namelen);
-
- BUG_ON(ret);
-
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- goto fail_commit;
-
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(!new_root);
-
- trans = btrfs_start_transaction(new_root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
- BTRFS_I(dir)->block_group);
- if (ret)
- goto fail;
-
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
nr = trans->blocks_used;
- err = btrfs_commit_transaction(trans, new_root);
+ err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-fail_commit:
+
+ btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
goto fail_unlock;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
memcpy(pending_snapshot->name, name, namelen);
@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(struct path *parent, char *name,
- int mode, int namelen,
+static noinline int btrfs_mksubvol(struct path *parent,
+ char *name, int namelen,
struct btrfs_root *snap_src)
{
+ struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
int error;
- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
if (dentry->d_inode)
goto out_dput;
- if (!IS_POSIXACL(parent->dentry->d_inode))
- mode &= ~current_umask();
-
error = mnt_want_write(parent->mnt);
if (error)
goto out_dput;
- error = btrfs_may_create(parent->dentry->d_inode, dentry);
+ error = btrfs_may_create(dir, dentry);
if (error)
goto out_drop_write;
- /*
- * Actually perform the low-level subvolume creation after all
- * this VFS fuzz.
- *
- * Eventually we want to pass in an inode under which we create this
- * subvolume, but for now all are under the filesystem root.
- *
- * Also we should pass on the mode eventually to allow creating new
- * subvolume with specific mode bits.
- */
+ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
if (snap_src) {
- struct dentry *dir = dentry->d_parent;
- struct dentry *test = dir->d_parent;
- struct btrfs_path *path = btrfs_alloc_path();
- int ret;
- u64 test_oid;
- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
-
- test_oid = snap_src->root_key.objectid;
-
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, parent_oid, test_oid);
- if (ret == 0)
- goto create;
- btrfs_release_path(snap_src->fs_info->tree_root, path);
-
- /* we need to make sure we aren't creating a directory loop
- * by taking a snapshot of something that has our current
- * subvol in its directory tree. So, this loops through
- * the dentries and checks the forward refs for each subvolume
- * to see if is references the subvolume where we are
- * placing this new snapshot.
- */
- while (1) {
- if (!test ||
- dir == snap_src->fs_info->sb->s_root ||
- test == snap_src->fs_info->sb->s_root ||
- test->d_inode->i_sb != snap_src->fs_info->sb) {
- break;
- }
- if (S_ISLNK(test->d_inode->i_mode)) {
- printk(KERN_INFO "Btrfs symlink in snapshot "
- "path, failed\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- test_oid =
- BTRFS_I(test->d_inode)->root->root_key.objectid;
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, test_oid, parent_oid);
- if (ret == 0) {
- printk(KERN_INFO "Btrfs snapshot creation "
- "failed, looping\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- btrfs_release_path(snap_src->fs_info->tree_root, path);
- test = test->d_parent;
- }
-create:
- btrfs_free_path(path);
- error = create_snapshot(snap_src, dentry, name, namelen);
+ error = create_snapshot(snap_src, dentry,
+ name, namelen);
} else {
- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
- dentry, name, namelen);
+ error = create_subvol(BTRFS_I(dir)->root, dentry,
+ name, namelen);
}
- if (error)
- goto out_drop_write;
-
- fsnotify_mkdir(parent->dentry->d_inode, dentry);
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_drop_write:
mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ mutex_unlock(&dir->i_mutex);
return error;
}
-
static int btrfs_defrag_file(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +534,8 @@ again:
clear_page_dirty_for_io(page);
btrfs_set_extent_delalloc(inode, page_start, page_end);
-
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +546,8 @@ out_unlock:
return 0;
}
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ void __user *arg)
{
u64 new_size;
u64 old_size;
@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
struct file *src_file;
- u64 root_dirid;
int namelen;
int ret = 0;
@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
- path, root_dirid,
- vol_args->name, namelen, 0);
- btrfs_free_path(path);
-
- if (di && !IS_ERR(di)) {
- ret = -EEXIST;
- goto out;
- }
-
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
-
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, NULL);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ NULL);
} else {
struct inode *src_inode;
src_file = fget(vol_args->fd);
@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
fput(src_file);
goto out;
}
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, BTRFS_I(src_inode)->root);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ BTRFS_I(src_inode)->root);
fput(src_file);
}
-
out:
kfree(vol_args);
return ret;
}
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg)
+{
+ struct dentry *parent = fdentry(file);
+ struct dentry *dentry;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ int namelen;
+ int ret;
+ int err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/') ||
+ strncmp(vol_args->name, "..", namelen) == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
+ goto out;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(vol_args->name, parent, namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (!dentry->d_inode) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = dentry->d_inode;
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ dest = BTRFS_I(inode)->root;
+
+ mutex_lock(&inode->i_mutex);
+ err = d_invalidate(dentry);
+ if (err)
+ goto out_unlock;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ BUG_ON(ret);
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ inode->i_flags |= S_DEAD;
+out_up_write:
+ up_write(&root->fs_info->subvol_sem);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ if (!err) {
+ shrink_dcache_sb(root->fs_info->sb);
+ btrfs_invalidate_inodes(dest);
+ d_delete(dentry);
+ }
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(file->f_path.mnt);
+out:
+ kfree(vol_args);
+ return err;
+}
+
static int btrfs_ioctl_defrag(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret;
}
-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* punch hole in destination first */
btrfs_drop_extents(trans, root, inode, off, off + len,
- off + len, 0, &hint_byte);
+ off + len, 0, &hint_byte, 1);
/* clone data */
key.objectid = src->i_ino;
@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datao + datal + key.offset >
- off + len)
- datal = off + len - key.offset - datao;
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
+ int ret;
+ ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ goto out;
- if (file->private_data) {
- ret = -EINPROGRESS;
+ ret = -EINPROGRESS;
+ if (file->private_data)
goto out;
- }
ret = mnt_want_write(file->f_path.mnt);
if (ret)
@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
root->fs_info->open_ioctl_trans++;
mutex_unlock(&root->fs_info->trans_mutex);
+ ret = -ENOMEM;
trans = btrfs_start_ioctl_transaction(root, 0);
- if (trans)
- file->private_data = trans;
- else
- ret = -ENOMEM;
- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+ if (!trans)
+ goto out_drop;
+
+ file->private_data = trans;
+ return 0;
+
+out_drop:
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ mnt_drop_write(file->f_path.mnt);
out:
return ret;
}
@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
trans = file->private_data;
- if (!trans) {
- ret = -EINVAL;
- goto out;
- }
- btrfs_end_transaction(trans, root);
+ if (!trans)
+ return -EINVAL;
file->private_data = NULL;
+ btrfs_end_transaction(trans, root);
+
mutex_lock(&root->fs_info->trans_mutex);
root->fs_info->open_ioctl_trans--;
mutex_unlock(&root->fs_info->trans_mutex);
mnt_drop_write(file->f_path.mnt);
-
-out:
- return ret;
+ return 0;
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_DEFRAG:
return btrfs_ioctl_defrag(file);
case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa13..bc49914475eb 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
struct btrfs_ioctl_vol_args)
-
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..5799bc46a309 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*
* len is the length of the extent
*
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->start = start;
entry->len = len;
entry->disk_len = disk_len;
+ entry->bytes_left = len;
entry->inode = inode;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&entry->rb_node);
BUG_ON(node);
- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
- entry_end(entry) - 1, GFP_NOFS);
-
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
- GFP_NOFS);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
goto out;
}
- ret = test_range_bit(io_tree, entry->file_offset,
- entry->file_offset + entry->len - 1,
- EXTENT_ORDERED, 0);
- if (ret == 0)
+ if (io_size > entry->bytes_left) {
+ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ (unsigned long long)entry->bytes_left,
+ (unsigned long long)io_size);
+ }
+ entry->bytes_left -= io_size;
+ if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ else
+ ret = 1;
out:
mutex_unlock(&tree->mutex);
return ret == 0;
@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+ inode, 1);
+
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -460,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 orig_end;
u64 wait_end;
struct btrfs_ordered_extent *ordered;
+ int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -489,19 +494,18 @@ again:
/* start IO across the range first to instantiate any delalloc
* extents
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
/* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again
* with WB_SYNC_ALL will end up waiting for the IO to actually start.
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
- btrfs_wait_on_page_writeback_range(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT,
- orig_end >> PAGE_CACHE_SHIFT);
+ filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
+ found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -514,6 +518,7 @@ again:
btrfs_put_ordered_extent(ordered);
break;
}
+ found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
@@ -521,8 +526,8 @@ again:
break;
end--;
}
- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+ EXTENT_DELALLOC, 0, NULL)) {
schedule_timeout(1);
goto again;
}
@@ -613,7 +618,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (test_range_bit(io_tree, disk_i_size,
ordered->file_offset + ordered->len - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
goto out;
}
/*
@@ -664,7 +669,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (i_size_test > entry_end(ordered) &&
!test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
new_i_size = min_t(u64, i_size_test, i_size_read(inode));
}
BTRFS_I(inode)->disk_i_size = new_i_size;
@@ -715,90 +720,6 @@ out:
}
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping: address space structure to write
- * @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends (inclusive)
- * @sync_mode: enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode)
-{
- struct writeback_control wbc = {
- .sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = start,
- .range_end = end,
- .for_writepages = 1,
- };
- return btrfs_writepages(mapping, &wbc);
-}
-
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping: target address_space
- * @start: beginning page index
- * @end: ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
-{
- struct pagevec pvec;
- int nr_pages;
- int ret = 0;
- pgoff_t index;
-
- if (end < start)
- return 0;
-
- pagevec_init(&pvec, 0);
- index = start;
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
- unsigned i;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
- wait_on_page_writeback(page);
- if (PageError(page))
- ret = -EIO;
- }
- pagevec_release(&pvec);
- cond_resched();
- }
-
- /* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
-
- return ret;
-}
-
/*
* add a given inode to the list of inodes that must be fully on
* disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
/* extent length on disk */
u64 disk_len;
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
/* flags (described above) */
unsigned long flags;
@@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
int btrfs_ordered_update_i_size(struct inode *inode,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end);
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f80..79cba5fbc28e 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f212602..cfcc93c93a7b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
int nr;
};
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
struct reloc_control {
/* block group to relocate */
struct btrfs_block_group_cache *block_group;
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
struct reloc_control *rc)
{
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
return 0;
}
@@ -2529,56 +2538,94 @@ out:
}
static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+ u64 block_start)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 page_start;
u64 page_end;
- unsigned long i;
- unsigned long first_index;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
unsigned long last_index;
- unsigned int total_read = 0;
- unsigned int total_dirty = 0;
+ unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
- struct btrfs_ordered_extent *ordered;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int nr = 0;
int ret = 0;
+ if (!cluster->nr)
+ return 0;
+
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
mutex_lock(&inode->i_mutex);
- first_index = start >> PAGE_CACHE_SHIFT;
- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
- /* make sure the dirty trick played by the caller work */
- while (1) {
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
- if (ret != -EBUSY)
- break;
- schedule_timeout(HZ/10);
- }
+ i_size_write(inode, cluster->end + 1 - offset);
+ ret = setup_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
if (ret)
goto out_unlock;
file_ra_state_init(ra, inode->i_mapping);
- for (i = first_index ; i <= last_index; i++) {
- if (total_read % ra->ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- min(last_index, ra->ra_pages + i - 1));
- }
- total_read++;
-again:
- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- BUG_ON(1);
- page = grab_cache_page(inode->i_mapping, i);
+ WARN_ON(cluster->start != cluster->boundary[0]);
+ while (index <= last_index) {
+ page = find_lock_page(inode->i_mapping, index);
if (!page) {
- ret = -ENOMEM;
- goto out_unlock;
+ page_cache_sync_readahead(inode->i_mapping,
+ ra, NULL, index,
+ last_index + 1 - index);
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping,
+ ra, NULL, page, index,
+ last_index + 1 - index);
}
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
@@ -2589,75 +2636,79 @@ again:
goto out_unlock;
}
}
- wait_on_page_writeback(page);
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
+
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
+
set_page_extent_mapped(page);
- if (i == first_index)
- set_extent_bits(io_tree, page_start, page_end,
+ if (nr < cluster->nr &&
+ page_start + offset == cluster->boundary[nr]) {
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
EXTENT_BOUNDARY, GFP_NOFS);
+ nr++;
+ }
btrfs_set_extent_delalloc(inode, page_start, page_end);
set_page_dirty(page);
- total_dirty++;
+ dirty_page++;
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
+
+ index++;
+ if (nr < cluster->nr &&
+ page_end + 1 + offset == cluster->boundary[nr]) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
+ dirty_page = 0;
+ }
+ }
+ if (dirty_page) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
}
+ WARN_ON(nr != cluster->nr);
out_unlock:
mutex_unlock(&inode->i_mutex);
kfree(ra);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
return ret;
}
static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_map *em;
- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
- u64 end = start + extent_key->offset - 1;
-
- em = alloc_extent_map(GFP_NOFS);
- em->start = start;
- em->len = extent_key->offset;
- em->block_len = extent_key->offset;
- em->block_start = extent_key->objectid;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ int ret;
- /* setup extent map to cheat btrfs_readpage */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- while (1) {
- int ret;
- spin_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(inode, start, end, 0);
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- return relocate_inode_pages(inode, start, extent_key->offset);
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
return 0;
}
+
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
+ struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
+ cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+ if (!cluster)
+ return -ENOMEM;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
rc->search_start = rc->block_group->key.objectid;
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_end_transaction(trans, rc->extent_root);
trans = NULL;
btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
- ret = relocate_data_extent(rc->data_inode, &key);
+ ret = relocate_data_extent(rc->data_inode,
+ &key, cluster);
if (ret < 0) {
err = ret;
break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ kfree(cluster);
+
rc->create_reloc_root = 0;
smp_mb();
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
+ btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (err)
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
+ err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
key.objectid = objectid;
@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
BUG_ON(!rc->block_group);
btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size, NULL);
rc->extent_root = extent_root;
btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(fs_info->tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
-
rc->extents_found = 0;
rc->extents_skipped = 0;
+ mutex_lock(&fs_info->cleaner_mutex);
+
+ btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
+
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
}
}
- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
- rc->block_group->key.objectid,
- rc->block_group->key.objectid +
- rc->block_group->key.offset - 1);
+ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+ rc->block_group->key.objectid,
+ rc->block_group->key.objectid +
+ rc->block_group->key.offset - 1);
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
return err;
}
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ root->root_item.drop_level = 0;
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ BUG_ON(ret);
+
+ ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ BUG_ON(ret);
+ return 0;
+}
+
/*
* recover relocation interrupted by system crash.
*
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
- goto out;
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ mark_garbage_root(reloc_root);
}
}
@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
mapping_tree_init(&rc->reloc_root_tree);
INIT_LIST_HEAD(&rc->reloc_roots);
btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size);
+ root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55a..67fa2d29d663 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
goto out;
BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = 1;
+ goto out;
+ }
l = path->nodes[0];
- BUG_ON(path->slots[0] == 0);
slot = path->slots[0] - 1;
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid) {
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
- sizeof(*item));
- memcpy(key, &found_key, sizeof(found_key));
+ if (item)
+ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+ sizeof(*item));
+ if (key)
+ memcpy(key, &found_key, sizeof(found_key));
ret = 0;
out:
btrfs_free_path(path);
@@ -153,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
return ret;
}
@@ -249,6 +254,59 @@ err:
return ret;
}
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(tree_root, path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_find_dead_roots(tree_root, key.offset);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ key.offset++;
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
/* drop the root item for 'key' from 'root' */
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key)
@@ -273,36 +331,61 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
BUG_ON(refs != 0);
ret = btrfs_del_item(trans, root, path);
out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
return ret;
}
-#if 0 /* this will get used when snapshot deletion is implemented */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id)
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len)
+
{
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
struct btrfs_key key;
+ unsigned long ptr;
+ int err = 0;
int ret;
- struct btrfs_path *path;
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret);
-
- ret = btrfs_del_item(trans, tree_root, path);
- BUG_ON(ret);
+ BUG_ON(ret < 0);
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+
+ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ BUG_ON(ret);
+ } else
+ err = -ENOENT;
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
btrfs_free_path(path);
- return ret;
+ return err;
}
-#endif
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -319,7 +402,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
return ret;
}
-
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +417,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len)
{
struct btrfs_key key;
@@ -346,13 +427,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
unsigned long ptr;
-
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
BUG_ON(ret);
@@ -366,6 +448,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
btrfs_free_path(path);
- return ret;
+ return 0;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..752a5463bf53 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
#include "export.h"
#include "compression.h"
-static struct super_operations btrfs_super_ops;
+static const struct super_operations btrfs_super_ops;
static void btrfs_put_super(struct super_block *sb)
{
@@ -66,7 +66,8 @@ enum {
Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
- Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+ Opt_discard, Opt_err,
};
static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
{Opt_notreelog, "notreelog"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
+ {Opt_discard, "discard"},
{Opt_err, NULL},
};
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->metadata_ratio);
}
break;
+ case Opt_discard:
+ btrfs_set_opt(info->mount_opt, DISCARD);
+ break;
default:
break;
}
@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
+#endif
tree_root = open_ctree(sb, fs_devices, (char *)data);
@@ -675,7 +682,8 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
-static struct super_operations btrfs_super_ops = {
+static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdbb5022da52..c207e8c32c9b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
- WARN_ON(root->root_item.refs == 0);
WARN_ON(root->commit_root != root->node);
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -164,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root)
}
}
+enum btrfs_trans_type {
+ TRANS_START,
+ TRANS_JOIN,
+ TRANS_USERSPACE,
+};
+
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- int num_blocks, int wait)
+ int num_blocks, int type)
{
struct btrfs_trans_handle *h =
kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -173,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
mutex_lock(&root->fs_info->trans_mutex);
if (!root->fs_info->log_root_recovering &&
- ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
wait_current_trans(root);
ret = join_transaction(root);
BUG_ON(ret);
@@ -187,6 +193,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
+
root->fs_info->running_transaction->use_count++;
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
@@ -196,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, 1);
+ return start_transaction(root, num_blocks, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, 0);
+ return start_transaction(root, num_blocks, TRANS_JOIN);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
- return start_transaction(r, num_blocks, 2);
+ return start_transaction(r, num_blocks, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -318,6 +327,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
mutex_unlock(&info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -339,10 +351,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * those extents are sent to disk but does not wait on them
*/
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
{
int ret;
int err = 0;
@@ -389,6 +401,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
page_cache_release(page);
}
}
+ if (err)
+ werr = err;
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
while (1) {
ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
EXTENT_DIRTY);
@@ -419,6 +454,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return werr;
}
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int ret2;
+
+ ret = btrfs_write_marked_extents(root, dirty_pages);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+ return ret || ret2;
+}
+
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -720,7 +771,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
key.objectid = objectid;
- key.offset = 0;
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
@@ -743,6 +795,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
+ btrfs_unreserve_metadata_space(root, 6);
return ret;
}
@@ -778,24 +831,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
pending->root_key.objectid,
- BTRFS_ROOT_BACKREF_KEY,
parent_root->root_key.objectid,
parent_inode->i_ino, index, pending->name,
namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- parent_root->root_key.objectid,
- BTRFS_ROOT_REF_KEY,
- pending->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
-
inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
d_instantiate(pending->dentry, inode);
fail:
@@ -874,7 +917,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
unsigned long timeout = 1;
struct btrfs_transaction *cur_trans;
struct btrfs_transaction *prev_trans = NULL;
- struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
int should_grow = 0;
@@ -915,13 +957,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
- if (!pinned_copy)
- return -ENOMEM;
-
- extent_io_tree_init(pinned_copy,
- root->fs_info->btree_inode->i_mapping, GFP_NOFS);
-
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1054,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans, root);
BUG_ON(ret);
+ btrfs_prepare_extent_commit(trans, root);
+
cur_trans = root->fs_info->running_transaction;
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = NULL;
@@ -1042,8 +1079,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
sizeof(root->fs_info->super_copy));
- btrfs_copy_pinned(root, pinned_copy);
-
trans->transaction->blocked = 0;
wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1094,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->fs_info->tree_log_mutex);
- btrfs_finish_extent_commit(trans, root, pinned_copy);
- kfree(pinned_copy);
+ btrfs_finish_extent_commit(trans, root);
/* do the directory inserts of any pending snapshot creations */
finish_pending_snapshots(trans, root->fs_info);
@@ -1078,6 +1112,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
}
@@ -1096,8 +1133,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);
- list_del_init(&root->root_list);
- btrfs_drop_snapshot(root, 0);
+ list_del(&root->root_list);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ btrfs_drop_snapshot(root, 0);
+ else
+ btrfs_drop_snapshot(root, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c67404918..d4e3e7a6938c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..741666a7676a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (!root->log_start_pid) {
+ root->log_start_pid = current->pid;
+ root->log_multiple_pids = false;
+ } else if (root->log_start_pid != current->pid) {
+ root->log_multiple_pids = true;
+ }
+
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
return 0;
}
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_update_pinned_extents(log->fs_info->extent_root,
- eb->start, eb->len, 1);
+ btrfs_pin_extent(log->fs_info->extent_root,
+ eb->start, eb->len, 0);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode,
- start, extent_end, extent_end, start, &alloc_hint);
+ start, extent_end, extent_end, start, &alloc_hint, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ u64 log_transid = 0;
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
unsigned long batch = root->log_batch;
- mutex_unlock(&root->log_mutex);
- schedule_timeout_uninterruptible(1);
- mutex_lock(&root->log_mutex);
-
+ if (root->log_multiple_pids) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
btrfs_set_root_node(&log->root_item, log->node);
root->log_batch = 0;
+ log_transid = root->log_transid;
root->log_transid++;
log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
smp_mb();
/*
* log tree has been flushed to disk, new modifications of
@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
BUG_ON(ret);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
btrfs_set_super_log_root(&root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- write_ctree_super(trans, root->fs_info->tree_root, 2);
+ write_ctree_super(trans, root->fs_info->tree_root, 1);
ret = 0;
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
@@ -2605,7 +2629,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
extent);
cs = btrfs_file_extent_offset(src, extent);
cl = btrfs_file_extent_num_bytes(src,
- extent);;
+ extent);
if (btrfs_file_extent_compression(src,
extent)) {
cs = 0;
@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2852,6 +2876,21 @@ out:
return ret;
}
+static int inode_in_log(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == trans->transid &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ if (root != BTRFS_I(inode)->root ||
+ btrfs_root_refs(&root->root_item) == 0) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
ret = check_parent_dirs_for_sync(trans, inode, parent,
sb, last_committed);
if (ret)
goto end_no_trans;
+ if (inode_in_log(trans, inode)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
start_log_trans(trans, root);
ret = btrfs_log_inode(trans, root, inode, inode_only);
@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
break;
inode = parent->d_inode;
+ if (root != BTRFS_I(inode)->root)
+ break;
+
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
BUG_ON(ret);
}
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
- u64 highest_inode;
struct walk_control wc = {
.process_func = process_one_buffer,
.stage = 0,
@@ -3010,11 +3062,6 @@ again:
path);
BUG_ON(ret);
}
- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
- if (ret == 0) {
- wc.replay_dest->highest_inode = highest_inode;
- wc.replay_dest->last_inode_alloc = highest_inode;
- }
key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16b..0776eacb5083 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
num_run++;
batch_run++;
- if (bio_sync(cur))
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
if (need_resched()) {
@@ -276,7 +276,7 @@ loop_lock:
* is now congested. Back off and let other work structs
* run instead
*/
- if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
goto error;
device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name)
+ if (!device->name) {
+ kfree(device);
goto error;
+ }
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
@@ -719,10 +721,9 @@ error:
* called very infrequently and that a given device has a small number
* of extents
*/
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 num_bytes, u64 *start,
- u64 *max_avail)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
BUG_ON(ret);
@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
* step two, delete the device extents and the
* chunk tree entries
*/
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(em->start > chunk_offset ||
em->start + em->len < chunk_offset);
@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
kfree(map);
em->bdev = NULL;
@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_key found_key;
u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ if (ret == -ENOSPC)
+ failed++;
+ else if (ret)
+ BUG();
}
if (found_key.offset == 0)
@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
key.offset = found_key.offset - 1;
}
ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ WARN_ON(1);
+ ret = -ENOSPC;
+ }
error:
btrfs_free_path(path);
return ret;
@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
+ if (ret == -ENOSPC)
+ break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
- key.offset = found_key.offset;
/* chunk zero is special */
- if (key.offset == 0)
+ if (found_key.offset == 0)
break;
btrfs_release_path(chunk_root, path);
@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ BUG_ON(ret && ret != -ENOSPC);
+ key.offset = found_key.offset - 1;
}
ret = 0;
error:
@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 chunk_offset;
int ret;
int slot;
+ int failed = 0;
+ bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
if (new_size >= device->total_bytes)
@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
-
path->reada = 2;
lock_chunks(root);
@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (device->writeable)
device->fs_devices->total_rw_bytes -= diff;
unlock_chunks(root);
- btrfs_end_transaction(trans, root);
+again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
if (ret) {
ret = 0;
+ btrfs_release_path(root, path);
break;
}
@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
- if (key.objectid != device->devid)
+ if (key.objectid != device->devid) {
+ btrfs_release_path(root, path);
break;
+ }
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (key.offset + length <= new_size)
+ if (key.offset + length <= new_size) {
+ btrfs_release_path(root, path);
break;
+ }
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
- if (ret)
+ if (ret && ret != -ENOSPC)
goto done;
+ if (ret == -ENOSPC)
+ failed++;
+ key.offset -= 1;
+ }
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ lock_chunks(root);
+
+ device->total_bytes = old_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ unlock_chunks(root);
+ goto done;
}
/* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2335,9 @@ again:
em->block_len = em->len;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
BUG_ON(ret);
free_extent_map(em);
@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
int readonly = 0;
int i;
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
if (!em)
return 1;
@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
struct extent_map *em;
while (1) {
- spin_lock(&tree->map_tree.lock);
+ write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
- spin_unlock(&tree->map_tree.lock);
+ write_unlock(&tree->map_tree.lock);
if (!em)
break;
kfree(em->bdev);
@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2645,9 @@ again:
atomic_set(&multi->error, 0);
}
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em && unplug_page)
return 0;
@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 stripe_nr;
int i, j, nr = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em || em->start != chunk_start);
map = (struct map_lookup *)em->bdev;
@@ -2903,7 +2944,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev->in_fs_metadata = 1;
}
- spin_lock(&map_tree->map_tree.lock);
+ write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
- spin_unlock(&map_tree->map_tree.lock);
+ write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret);
free_extent_map(em);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f721..31b0fabdd2ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
void btrfs_unlock_volumes(void);
void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b6dd5967c48a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
* attributes are handled directly.
*/
struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
&btrfs_xattr_acl_access_handler,
&btrfs_xattr_acl_default_handler,
#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..6fa530256bfd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,6 +52,7 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
bh->b_end_io = handler;
bh->b_private = private;
}
+EXPORT_SYMBOL(init_buffer);
static int sync_buffer(void *word)
{
@@ -80,6 +81,7 @@ void unlock_buffer(struct buffer_head *bh)
smp_mb__after_clear_bit();
wake_up_bit(&bh->b_state, BH_Lock);
}
+EXPORT_SYMBOL(unlock_buffer);
/*
* Block until a buffer comes unlocked. This doesn't stop it
@@ -90,6 +92,7 @@ void __wait_on_buffer(struct buffer_head * bh)
{
wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}
+EXPORT_SYMBOL(__wait_on_buffer);
static void
__clear_page_buffers(struct page *page)
@@ -144,6 +147,7 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
__end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
}
+EXPORT_SYMBOL(end_buffer_read_sync);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
@@ -164,6 +168,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
unlock_buffer(bh);
put_bh(bh);
}
+EXPORT_SYMBOL(end_buffer_write_sync);
/*
* Various filesystems appear to want __find_get_block to be non-blocking.
@@ -272,16 +277,17 @@ void invalidate_bdev(struct block_device *bdev)
invalidate_bh_lrus();
invalidate_mapping_pages(mapping, 0, -1);
}
+EXPORT_SYMBOL(invalidate_bdev);
/*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
*/
static void free_more_memory(void)
{
struct zone *zone;
int nid;
- wakeup_pdflush(1024);
+ wakeup_flusher_threads(1024);
yield();
for_each_online_node(nid) {
@@ -410,6 +416,7 @@ still_busy:
local_irq_restore(flags);
return;
}
+EXPORT_SYMBOL(end_buffer_async_write);
/*
* If a page's buffers are under async readin (end_buffer_async_read
@@ -438,8 +445,8 @@ static void mark_buffer_async_read(struct buffer_head *bh)
set_buffer_async_read(bh);
}
-void mark_buffer_async_write_endio(struct buffer_head *bh,
- bh_end_io_t *handler)
+static void mark_buffer_async_write_endio(struct buffer_head *bh,
+ bh_end_io_t *handler)
{
bh->b_end_io = handler;
set_buffer_async_write(bh);
@@ -553,7 +560,7 @@ repeat:
return err;
}
-void do_thaw_all(struct work_struct *work)
+static void do_thaw_all(struct work_struct *work)
{
struct super_block *sb;
char b[BDEVNAME_SIZE];
@@ -1172,6 +1179,7 @@ void mark_buffer_dirty(struct buffer_head *bh)
}
}
}
+EXPORT_SYMBOL(mark_buffer_dirty);
/*
* Decrement a buffer_head's reference count. If all buffers against a page
@@ -1188,6 +1196,7 @@ void __brelse(struct buffer_head * buf)
}
WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
+EXPORT_SYMBOL(__brelse);
/*
* bforget() is like brelse(), except it discards any
@@ -1206,6 +1215,7 @@ void __bforget(struct buffer_head *bh)
}
__brelse(bh);
}
+EXPORT_SYMBOL(__bforget);
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
@@ -1699,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
/*
* If it's a fully non-blocking write attempt and we cannot
* lock the buffer then redirty the page. Note that this can
- * potentially cause a busy-wait loop from pdflush and kswapd
- * activity, but those code paths have their own higher-level
- * throttling.
+ * potentially cause a busy-wait loop from writeback threads
+ * and kswapd activity, but those code paths have their own
+ * higher-level throttling.
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
@@ -2218,6 +2228,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
}
return 0;
}
+EXPORT_SYMBOL(block_read_full_page);
/* utility function for filesystems that need to do work on expanding
* truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2228,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
- unsigned long limit;
int err;
- err = -EFBIG;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && size > (loff_t)limit) {
- send_sig(SIGXFSZ, current, 0);
- goto out;
- }
- if (size > inode->i_sb->s_maxbytes)
+ err = inode_newsize_ok(inode, size);
+ if (err)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
@@ -2252,6 +2257,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
out:
return err;
}
+EXPORT_SYMBOL(generic_cont_expand_simple);
static int cont_expand_zero(struct file *file, struct address_space *mapping,
loff_t pos, loff_t *bytes)
@@ -2352,6 +2358,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
out:
return err;
}
+EXPORT_SYMBOL(cont_write_begin);
int block_prepare_write(struct page *page, unsigned from, unsigned to,
get_block_t *get_block)
@@ -2362,6 +2369,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
ClearPageUptodate(page);
return err;
}
+EXPORT_SYMBOL(block_prepare_write);
int block_commit_write(struct page *page, unsigned from, unsigned to)
{
@@ -2369,6 +2377,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
__block_commit_write(inode,page,from,to);
return 0;
}
+EXPORT_SYMBOL(block_commit_write);
/*
* block_page_mkwrite() is not allowed to change the file size as it gets
@@ -2426,6 +2435,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
out:
return ret;
}
+EXPORT_SYMBOL(block_page_mkwrite);
/*
* nobh_write_begin()'s prereads are special: the buffer_heads are freed
@@ -2849,6 +2859,7 @@ unlock:
out:
return err;
}
+EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
@@ -2890,6 +2901,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
return __block_write_full_page(inode, page, get_block, wbc, handler);
}
+EXPORT_SYMBOL(block_write_full_page_endio);
/*
* The generic ->writepage function for buffer-backed address_spaces
@@ -2900,7 +2912,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
return block_write_full_page_endio(page, get_block, wbc,
end_buffer_async_write);
}
-
+EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
@@ -2913,6 +2925,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block(inode, block, &tmp, 0);
return tmp.b_blocknr;
}
+EXPORT_SYMBOL(generic_block_bmap);
static void end_bio_bh_io_sync(struct bio *bio, int err)
{
@@ -2982,6 +2995,7 @@ int submit_bh(int rw, struct buffer_head * bh)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
@@ -3043,6 +3057,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
unlock_buffer(bh);
}
}
+EXPORT_SYMBOL(ll_rw_block);
/*
* For a data-integrity writeout, we need to wait upon any in-progress I/O
@@ -3071,6 +3086,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
}
return ret;
}
+EXPORT_SYMBOL(sync_dirty_buffer);
/*
* try_to_free_buffers() checks if all the buffers on this particular page
@@ -3185,13 +3201,14 @@ void block_sync_page(struct page *page)
if (mapping)
blk_run_backing_dev(mapping->backing_dev_info, page);
}
+EXPORT_SYMBOL(block_sync_page);
/*
* There are no bdflush tunables left. But distributions are
* still running obsolete flush daemons, so we terminate them here.
*
* Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
*/
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
@@ -3361,29 +3378,3 @@ void __init buffer_init(void)
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
hotcpu_notifier(buffer_cpu_notify, 0);
}
-
-EXPORT_SYMBOL(__bforget);
-EXPORT_SYMBOL(__brelse);
-EXPORT_SYMBOL(__wait_on_buffer);
-EXPORT_SYMBOL(block_commit_write);
-EXPORT_SYMBOL(block_prepare_write);
-EXPORT_SYMBOL(block_page_mkwrite);
-EXPORT_SYMBOL(block_read_full_page);
-EXPORT_SYMBOL(block_sync_page);
-EXPORT_SYMBOL(block_truncate_page);
-EXPORT_SYMBOL(block_write_full_page);
-EXPORT_SYMBOL(block_write_full_page_endio);
-EXPORT_SYMBOL(cont_write_begin);
-EXPORT_SYMBOL(end_buffer_read_sync);
-EXPORT_SYMBOL(end_buffer_write_sync);
-EXPORT_SYMBOL(end_buffer_async_write);
-EXPORT_SYMBOL(file_fsync);
-EXPORT_SYMBOL(generic_block_bmap);
-EXPORT_SYMBOL(generic_cont_expand_simple);
-EXPORT_SYMBOL(init_buffer);
-EXPORT_SYMBOL(invalidate_bdev);
-EXPORT_SYMBOL(ll_rw_block);
-EXPORT_SYMBOL(mark_buffer_dirty);
-EXPORT_SYMBOL(submit_bh);
-EXPORT_SYMBOL(sync_dirty_buffer);
-EXPORT_SYMBOL(unlock_buffer);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..bc1fbd956187
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,26 @@
+config CEPH_FS
+ tristate "Ceph distributed file system (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ select LIBCRC32C
+ help
+ Choose Y or M here to include support for mounting the
+ experimental Ceph distributed file system. Ceph is an extremely
+ scalable file system designed to provide high performance,
+ reliable access to petabytes of storage.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_FS
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This icnreases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..827629c85768
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ export.o caps.o snap.o xattr.o \
+ messenger.o msgpool.o buffer.o \
+ mds_client.o mdsmap.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland kernel
+src/include/ceph_fs.h fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc fs/ceph/ceph_fs.c
+src/include/msgr.h fs/ceph/msgr.h
+src/include/rados.h fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc fs/ceph/ceph_frag.c
+src/include/ceph_hash.h fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc fs/ceph/ceph_hash.c
+src/crush/crush.c fs/ceph/crush/crush.c
+src/crush/crush.h fs/ceph/crush/crush.h
+src/crush/mapper.c fs/ceph/crush/mapper.c
+src/crush/mapper.h fs/ceph/crush/mapper.h
+src/crush/hash.h fs/ceph/crush/hash.h
+src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..bf535815592d
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1115 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h> /* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page. This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode. In the absense of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress. In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_. Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+
+/*
+ * Dirty a page. Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate. If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ int undo = 0;
+ struct ceph_snap_context *snapc;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ if (TestSetPageDirty(page)) {
+ dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+ mapping->host, page, page->index);
+ return 0;
+ }
+
+ inode = mapping->host;
+ ci = ceph_inode(inode);
+
+ /*
+ * Note that we're grabbing a snapc ref here without holding
+ * any locks!
+ */
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+ /* dirty the head */
+ spin_lock(&inode->i_lock);
+ if (ci->i_wrbuffer_ref_head == 0)
+ ci->i_head_snapc = ceph_get_snap_context(snapc);
+ ++ci->i_wrbuffer_ref_head;
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(inode);
+ ++ci->i_wrbuffer_ref;
+ dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ mapping->host, page, page->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
+ spin_unlock(&inode->i_lock);
+
+ /* now adjust page */
+ spin_lock_irq(&mapping->tree_lock);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ }
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
+ } else {
+ dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+ undo = 1;
+ }
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (undo)
+ /* whoops, we failed to dirty the page */
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ BUG_ON(!PageDirty(page));
+ return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately. Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc = (void *)page->private;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page->private);
+ BUG_ON(!PagePrivate(page));
+ BUG_ON(!page->mapping);
+
+ /*
+ * We can get non-dirty pages here due to races between
+ * set_page_dirty and truncate_complete_page; just spit out a
+ * warning, in case we end up with accounting problems later.
+ */
+ if (!PageDirty(page))
+ pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+ if (offset == 0)
+ ClearPageChecked(page);
+
+ ci = ceph_inode(inode);
+ if (offset == 0) {
+ dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+ inode, page, page->index, offset);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
+ } else {
+ dout("%p invalidatepage %p idx %lu partial dirty page\n",
+ inode, page, page->index);
+ }
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+ struct inode *inode = page->mapping ? page->mapping->host : NULL;
+ dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ WARN_ON(PageDirty(page));
+ WARN_ON(page->private);
+ WARN_ON(PagePrivate(page));
+ return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int err = 0;
+ u64 len = PAGE_CACHE_SIZE;
+
+ dout("readpage inode %p file %p page %p index %lu\n",
+ inode, filp, page, page->index);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ page->index << PAGE_CACHE_SHIFT, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1);
+ if (err == -ENOENT)
+ err = 0;
+ if (err < 0) {
+ SetPageError(page);
+ goto out;
+ } else if (err < PAGE_CACHE_SIZE) {
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ }
+ SetPageUptodate(page);
+
+out:
+ return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+ int r = readpage_nounlock(filp, page);
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+ unsigned *nr_pages)
+{
+ struct page **pages;
+ struct page *page;
+ int next_index, contig_pages = 0;
+
+ /* build page vector */
+ pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ BUG_ON(list_empty(page_list));
+ next_index = list_entry(page_list->prev, struct page, lru)->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index == next_index) {
+ dout("readpages page %d %p\n", contig_pages, page);
+ pages[contig_pages] = page;
+ contig_pages++;
+ next_index++;
+ } else {
+ break;
+ }
+ }
+ *nr_pages = contig_pages;
+ return pages;
+}
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int rc = 0;
+ struct page **pages;
+ struct pagevec pvec;
+ loff_t offset;
+ u64 len;
+
+ dout("readpages %p file %p nr_pages %d\n",
+ inode, file, nr_pages);
+
+ pages = page_vector_from_list(page_list, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /* guess read extent */
+ offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ len = nr_pages << PAGE_CACHE_SHIFT;
+ rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ offset, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ pages, nr_pages);
+ if (rc == -ENOENT)
+ rc = 0;
+ if (rc < 0)
+ goto out;
+
+ /* set uptodate and add to lru in pagevec-sized chunks */
+ pagevec_init(&pvec, 0);
+ for (; !list_empty(page_list) && len > 0;
+ rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+ struct page *page =
+ list_entry(page_list->prev, struct page, lru);
+
+ list_del(&page->lru);
+
+ if (rc < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = rc < 0 ? 0 : rc;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ }
+
+ if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ dout("readpages %p add_to_page_cache failed %p\n",
+ inode, page);
+ continue;
+ }
+ dout("readpages %p adding %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ if (pagevec_add(&pvec, page) == 0)
+ pagevec_lru_add_file(&pvec); /* add to lru */
+ }
+ pagevec_lru_add_file(&pvec);
+ rc = 0;
+
+out:
+ kfree(pages);
+ return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ *
+ * Caller holds i_lock.
+ */
+static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+ capsnap->context, capsnap->dirty_pages);
+ if (capsnap->dirty_pages) {
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (snap_size)
+ *snap_size = capsnap->size;
+ break;
+ }
+ }
+ if (!snapc && ci->i_snap_realm) {
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+ dout(" head snapc %p has %d dirty pages\n",
+ snapc, ci->i_wrbuffer_ref_head);
+ }
+ return snapc;
+}
+
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_snap_context *snapc = NULL;
+
+ spin_lock(&inode->i_lock);
+ snapc = __get_oldest_context(inode, snap_size);
+ spin_unlock(&inode->i_lock);
+ return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_osd_client *osdc;
+ loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+ int len = PAGE_CACHE_SIZE;
+ loff_t i_size;
+ int err = 0;
+ struct ceph_snap_context *snapc;
+ u64 snap_size = 0;
+
+ dout("writepage %p idx %lu\n", page, page->index);
+
+ if (!page->mapping || !page->mapping->host) {
+ dout("writepage %p - no mapping\n", page);
+ return -EFAULT;
+ }
+ inode = page->mapping->host;
+ ci = ceph_inode(inode);
+ osdc = &ceph_inode_to_client(inode)->osdc;
+
+ /* verify this is a writeable snap context */
+ snapc = (void *)page->private;
+ if (snapc == NULL) {
+ dout("writepage %p page %p not dirty?\n", inode, page);
+ goto out;
+ }
+ if (snapc != get_oldest_context(inode, &snap_size)) {
+ dout("writepage %p page %p snapc %p not writeable - noop\n",
+ inode, page, (void *)page->private);
+ /* we should only noop if called by kswapd */
+ WARN_ON((current->flags & PF_MEMALLOC) == 0);
+ goto out;
+ }
+
+ /* is this a partial page at end of file? */
+ if (snap_size)
+ i_size = snap_size;
+ else
+ i_size = i_size_read(inode);
+ if (i_size < page_off + len)
+ len = i_size - page_off;
+
+ dout("writepage %p page %p index %lu on %llu~%u\n",
+ inode, page, page->index, page_off, len);
+
+ set_page_writeback(page);
+ err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+ &ci->i_layout, snapc,
+ page_off, len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &inode->i_mtime,
+ &page, 1, 0, 0, true);
+ if (err < 0) {
+ dout("writepage setting page/mapping error %d %p\n", err, page);
+ SetPageError(page);
+ mapping_set_error(&inode->i_data, err);
+ if (wbc)
+ wbc->pages_skipped++;
+ } else {
+ dout("writepage cleaned page %p\n", page);
+ err = 0; /* vfs expects us to return 0 */
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ end_page_writeback(page);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+out:
+ return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int err = writepage_nounlock(page, wbc);
+ unlock_page(page);
+ return err;
+}
+
+
+/*
+ * lame release_pages helper. release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+ struct pagevec pvec;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ for (i = 0; i < num; i++) {
+ if (pagevec_add(&pvec, pages[i]) == 0)
+ pagevec_release(&pvec);
+ }
+ pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned wrote;
+ loff_t offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+ struct page *page;
+ int i;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control *wbc = req->r_wbc;
+ __s32 rc = -EIO;
+ u64 bytes = 0;
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+
+ if (rc >= 0) {
+ wrote = (bytes + (offset & ~PAGE_CACHE_MASK) + ~PAGE_CACHE_MASK)
+ >> PAGE_CACHE_SHIFT;
+ WARN_ON(wrote != req->r_num_pages);
+ } else {
+ wrote = 0;
+ mapping_set_error(mapping, rc);
+ }
+ dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+ inode, rc, bytes, wrote);
+
+ /* clean all pages */
+ for (i = 0; i < req->r_num_pages; i++) {
+ page = req->r_pages[i];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ if (i >= wrote) {
+ dout("inode %p skipping page %p\n", inode, page);
+ wbc->pages_skipped++;
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ ceph_put_snap_context(snapc);
+ dout("unlocking %d %p\n", i, page);
+ end_page_writeback(page);
+ unlock_page(page);
+ }
+ dout("%p wrote+cleaned %d pages\n", inode, wrote);
+ ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+ ceph_release_pages(req->r_pages, req->r_num_pages);
+ if (req->r_pages_from_pool)
+ mempool_free(req->r_pages,
+ ceph_client(inode->i_sb)->wb_pagevec_pool);
+ else
+ kfree(req->r_pages);
+ ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via a the
+ * mempool. we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+ struct ceph_osd_request *req)
+{
+ req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+ GFP_NOFS);
+ if (!req->r_pages) {
+ req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+ req->r_pages_from_pool = 1;
+ WARN_ON(!req->r_pages);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ pgoff_t index, start, end;
+ int range_whole = 0;
+ int should_loop = 1;
+ pgoff_t max_pages = 0, max_pages_ever = 0;
+ struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+ struct pagevec pvec;
+ int done = 0;
+ int rc = 0;
+ unsigned wsize = 1 << inode->i_blkbits;
+ struct ceph_osd_request *req = NULL;
+ int do_sync;
+ u64 snap_size = 0;
+
+ /*
+ * Include a 'sync' in the OSD request if this is a data
+ * integrity write (e.g., O_SYNC write or fsync()), or if our
+ * cap is being revoked.
+ */
+ do_sync = wbc->sync_mode == WB_SYNC_ALL;
+ if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ do_sync = 1;
+ dout("writepages_start %p dosync=%d (mode=%s)\n",
+ inode, do_sync,
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ client = ceph_inode_to_client(inode);
+ if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ pr_warning("writepage_start %p on forced umount\n", inode);
+ return -EIO; /* we're in a forced umount, don't write! */
+ }
+ if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+ wsize = client->mount_args->wsize;
+ if (wsize < PAGE_CACHE_SIZE)
+ wsize = PAGE_CACHE_SIZE;
+ max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+
+ /* ?? */
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ dout(" writepages congested\n");
+ wbc->encountered_congestion = 1;
+ goto out_final;
+ }
+
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ dout(" cyclic, start at %lu\n", start);
+ } else {
+ start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ should_loop = 0;
+ dout(" not cyclic, %lu to %lu\n", start, end);
+ }
+ index = start;
+
+retry:
+ /* find oldest snap context with dirty data */
+ ceph_put_snap_context(snapc);
+ snapc = get_oldest_context(inode, &snap_size);
+ if (!snapc) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ dout(" no snap context with dirty data?\n");
+ goto out;
+ }
+ dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+ snapc, snapc->seq, snapc->num_snaps);
+ if (last_snapc && snapc != last_snapc) {
+ /* if we switched to a newer snapc, restart our scan at the
+ * start of the original file range. */
+ dout(" snapc differs from last pass, restarting at %lu\n",
+ index);
+ index = start;
+ }
+ last_snapc = snapc;
+
+ while (!done && index <= end) {
+ unsigned i;
+ int first;
+ pgoff_t next;
+ int pvec_pages, locked_pages;
+ struct page *page;
+ int want;
+ u64 offset, len;
+ struct ceph_osd_request_head *reqhead;
+ struct ceph_osd_op *op;
+
+ next = 0;
+ locked_pages = 0;
+ max_pages = max_pages_ever;
+
+get_more_pages:
+ first = -1;
+ want = min(end - index,
+ min((pgoff_t)PAGEVEC_SIZE,
+ max_pages - (pgoff_t)locked_pages) - 1)
+ + 1;
+ pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ want);
+ dout("pagevec_lookup_tag got %d\n", pvec_pages);
+ if (!pvec_pages && !locked_pages)
+ break;
+ for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+ page = pvec.pages[i];
+ dout("? %p idx %lu\n", page, page->index);
+ if (locked_pages == 0)
+ lock_page(page); /* first page */
+ else if (!trylock_page(page))
+ break;
+
+ /* only dirty pages, or our accounting breaks */
+ if (unlikely(!PageDirty(page)) ||
+ unlikely(page->mapping != mapping)) {
+ dout("!dirty or !mapping %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (!wbc->range_cyclic && page->index > end) {
+ dout("end of range %p\n", page);
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (next && (page->index != next)) {
+ dout("not consecutive %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ dout("waiting on writeback %p\n", page);
+ wait_on_page_writeback(page);
+ }
+ if ((snap_size && page_offset(page) > snap_size) ||
+ (!snap_size &&
+ page_offset(page) > i_size_read(inode))) {
+ dout("%p page eof %llu\n", page, snap_size ?
+ snap_size : i_size_read(inode));
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (PageWriteback(page)) {
+ dout("%p under writeback\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* only if matching snap context */
+ if (snapc != (void *)page->private) {
+ dout("page snapc %p != oldest %p\n",
+ (void *)page->private, snapc);
+ unlock_page(page);
+ if (!locked_pages)
+ continue; /* keep looking for snap */
+ break;
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ dout("%p !clear_page_dirty_for_io\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* ok */
+ if (locked_pages == 0) {
+ /* prepare async write request */
+ offset = page->index << PAGE_CACHE_SHIFT;
+ len = wsize;
+ req = ceph_osdc_new_request(&client->osdc,
+ &ci->i_layout,
+ ceph_vino(inode),
+ offset, &len,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, do_sync,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ &inode->i_mtime, true, 1);
+ max_pages = req->r_num_pages;
+
+ alloc_page_vec(client, req);
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+ req->r_wbc = wbc;
+ }
+
+ /* note position of first page in pvec */
+ if (first < 0)
+ first = i;
+ dout("%p will write page %p idx %lu\n",
+ inode, page, page->index);
+ set_page_writeback(page);
+ req->r_pages[locked_pages] = page;
+ locked_pages++;
+ next = page->index + 1;
+ }
+
+ /* did we get anything? */
+ if (!locked_pages)
+ goto release_pvec_pages;
+ if (i) {
+ int j;
+ BUG_ON(!locked_pages || first < 0);
+
+ if (pvec_pages && i == pvec_pages &&
+ locked_pages < max_pages) {
+ dout("reached end pvec, trying for more\n");
+ pagevec_reinit(&pvec);
+ goto get_more_pages;
+ }
+
+ /* shift unused pages over in the pvec... we
+ * will need to release them below. */
+ for (j = i; j < pvec_pages; j++) {
+ dout(" pvec leftover page %p\n",
+ pvec.pages[j]);
+ pvec.pages[j-i+first] = pvec.pages[j];
+ }
+ pvec.nr -= i-first;
+ }
+
+ /* submit the write */
+ offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+ len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ (u64)locked_pages << PAGE_CACHE_SHIFT);
+ dout("writepages got %d pages at %llu~%llu\n",
+ locked_pages, offset, len);
+
+ /* revise final length, page count */
+ req->r_num_pages = locked_pages;
+ reqhead = req->r_request->front.iov_base;
+ op = (void *)(reqhead + 1);
+ op->extent.length = cpu_to_le64(len);
+ op->payload_len = cpu_to_le32(len);
+ req->r_request->hdr.data_len = cpu_to_le32(len);
+
+ ceph_osdc_start_request(&client->osdc, req, true);
+ req = NULL;
+
+ /* continue? */
+ index = next;
+ wbc->nr_to_write -= locked_pages;
+ if (wbc->nr_to_write <= 0)
+ done = 1;
+
+release_pvec_pages:
+ dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+ pvec.nr ? pvec.pages[0] : NULL);
+ pagevec_release(&pvec);
+
+ if (locked_pages && !done)
+ goto retry;
+ }
+
+ if (should_loop && !done) {
+ /* more to do; loop back to beginning of file */
+ dout("writepages looping back to beginning of file\n");
+ should_loop = 0;
+ index = 0;
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+
+out:
+ if (req)
+ ceph_osdc_put_request(req);
+ if (rc > 0)
+ rc = 0; /* vfs expects us to return 0 */
+ ceph_put_snap_context(snapc);
+ dout("writepages done, rc = %d\n", rc);
+out_final:
+ return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+ struct ceph_snap_context *snapc)
+{
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+ return !oldest || snapc->seq <= oldest->seq;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ loff_t page_off = pos & PAGE_CACHE_MASK;
+ int pos_in_page = pos & ~PAGE_CACHE_MASK;
+ int end_in_page = pos_in_page + len;
+ loff_t i_size;
+ struct ceph_snap_context *snapc;
+ int r;
+
+ /* get a page*/
+retry:
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ dout("write_begin file %p inode %p page %p %d~%d\n", file,
+ inode, page, (int)pos, (int)len);
+
+retry_locked:
+ /* writepages currently holds page lock, but if we change that later, */
+ wait_on_page_writeback(page);
+
+ /* check snap context */
+ BUG_ON(!ci->i_snap_realm);
+ down_read(&mdsc->snap_rwsem);
+ BUG_ON(!ci->i_snap_realm->cached_context);
+ if (page->private &&
+ (void *)page->private != ci->i_snap_realm->cached_context) {
+ /*
+ * this page is already dirty in another (older) snap
+ * context! is it writeable now?
+ */
+ snapc = get_oldest_context(inode, NULL);
+ up_read(&mdsc->snap_rwsem);
+
+ if (snapc != (void *)page->private) {
+ dout(" page %p snapc %p not current or oldest\n",
+ page, (void *)page->private);
+ /*
+ * queue for writeback, and wait for snapc to
+ * be writeable or written
+ */
+ snapc = ceph_get_snap_context((void *)page->private);
+ unlock_page(page);
+ if (ceph_queue_writeback(inode))
+ igrab(inode);
+ wait_event_interruptible(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ goto retry;
+ }
+
+ /* yay, writeable, do it now (without dropping page lock) */
+ dout(" page %p snapc %p not current, but oldest\n",
+ page, snapc);
+ if (!clear_page_dirty_for_io(page))
+ goto retry_locked;
+ r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ return 0;
+ }
+
+ /* full page? */
+ if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+ return 0;
+
+ /* past end of file? */
+ i_size = inode->i_size; /* caller holds i_mutex */
+
+ if (i_size + len > inode->i_sb->s_maxbytes) {
+ /* file is too big */
+ r = -EINVAL;
+ goto fail;
+ }
+
+ if (page_off >= i_size ||
+ (pos_in_page == 0 && (pos+len) >= i_size &&
+ end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+ dout(" zeroing %p 0 - %d and %d - %d\n",
+ page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+ zero_user_segments(page,
+ 0, pos_in_page,
+ end_in_page, PAGE_CACHE_SIZE);
+ return 0;
+ }
+
+ /* we need to read it. */
+ up_read(&mdsc->snap_rwsem);
+ r = readpage_nounlock(file, page);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+
+fail:
+ up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ int check_cap = 0;
+
+ dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+ inode, page, (int)pos, (int)copied, (int)len);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (copied < len)
+ zero_user_segment(page, from+copied, len);
+
+ /* did file size increase? */
+ /* (no need for i_size_read(); we caller holds i_mutex */
+ if (pos+copied > inode->i_size)
+ check_cap = ceph_inode_set_size(inode, pos+copied);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ set_page_dirty(page);
+
+ unlock_page(page);
+ up_read(&mdsc->snap_rwsem);
+ page_cache_release(page);
+
+ if (check_cap)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+ return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+ .readpage = ceph_readpage,
+ .readpages = ceph_readpages,
+ .writepage = ceph_writepage,
+ .writepages = ceph_writepages_start,
+ .write_begin = ceph_write_begin,
+ .write_end = ceph_write_end,
+ .set_page_dirty = ceph_set_page_dirty,
+ .invalidatepage = ceph_invalidatepage,
+ .releasepage = ceph_releasepage,
+ .direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct page *page = vmf->page;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ loff_t off = page->index << PAGE_CACHE_SHIFT;
+ loff_t size, len;
+ struct page *locked_page = NULL;
+ void *fsdata = NULL;
+ int ret;
+
+ size = i_size_read(inode);
+ if (off + PAGE_CACHE_SIZE <= size)
+ len = PAGE_CACHE_SIZE;
+ else
+ len = size & ~PAGE_CACHE_MASK;
+
+ dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+ off, len, page, page->index);
+ ret = ceph_write_begin(vma->vm_file, inode->i_mapping, off, len, 0,
+ &locked_page, &fsdata);
+ WARN_ON(page != locked_page);
+ if (!ret) {
+ /*
+ * doing the following, instead of calling
+ * ceph_write_end. Note that we keep the
+ * page locked
+ */
+ set_page_dirty(page);
+ up_read(&mdsc->snap_rwsem);
+ page_cache_release(page);
+ ret = VM_FAULT_LOCKED;
+ } else {
+ ret = VM_FAULT_SIGBUS;
+ }
+ dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+ return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ceph_vmops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..32f2e2a021ab
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,225 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/err.h>
+
+#include "types.h"
+#include "auth_none.h"
+#include "decode.h"
+#include "super.h"
+
+#include "messenger.h"
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+ CEPH_AUTH_NONE
+};
+
+int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+ switch (protocol) {
+ case CEPH_AUTH_NONE:
+ return ceph_auth_none_init(ac);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+ struct ceph_auth_client *ac;
+ int ret;
+
+ dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+ ret = -ENOMEM;
+ ac = kzalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ goto out;
+
+ ac->negotiating = true;
+ if (name)
+ ac->name = name;
+ else
+ ac->name = CEPH_AUTH_NAME_DEFAULT;
+ dout("auth_init name %s secret %s\n", ac->name, secret);
+ ac->secret = secret;
+ return ac;
+
+out:
+ return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+ dout("auth_destroy %p\n", ac);
+ if (ac->ops)
+ ac->ops->destroy(ac);
+ kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+ dout("auth_reset %p\n", ac);
+ if (ac->ops && !ac->negotiating)
+ ac->ops->reset(ac);
+ ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+ int len = strlen(name);
+
+ if (*p + 2*sizeof(u32) + len > end)
+ return -ERANGE;
+ ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_32(p, len);
+ ceph_encode_copy(p, name, len);
+ return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor. Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+ struct ceph_mon_request_header *monhdr = buf;
+ void *p = monhdr + 1, *end = buf + len, *lenp;
+ int i, num;
+ int ret;
+
+ dout("auth_build_hello\n");
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, 0); /* no protocol, yet */
+
+ lenp = p;
+ p += sizeof(u32);
+
+ num = ARRAY_SIZE(supported_protocols);
+ ceph_encode_32(&p, num);
+ for (i = 0; i < num; i++)
+ ceph_encode_32(&p, supported_protocols[i]);
+
+ ret = ceph_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ return ret;
+ ceph_decode_need(&p, end, sizeof(u64), bad);
+ ceph_encode_64(&p, ac->global_id);
+
+ ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+ return p - buf;
+
+bad:
+ return -ERANGE;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len)
+{
+ void *p = buf;
+ void *end = buf + len;
+ int protocol;
+ s32 result;
+ u64 global_id;
+ void *payload, *payload_end;
+ int payload_len;
+ char *result_msg;
+ int result_msg_len;
+ int ret = -EINVAL;
+
+ dout("handle_auth_reply %p %p\n", p, end);
+ ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+ protocol = ceph_decode_32(&p);
+ result = ceph_decode_32(&p);
+ global_id = ceph_decode_64(&p);
+ payload_len = ceph_decode_32(&p);
+ payload = p;
+ p += payload_len;
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ result_msg_len = ceph_decode_32(&p);
+ result_msg = p;
+ p += result_msg_len;
+ if (p != end)
+ goto bad;
+
+ dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+ result_msg, global_id, payload_len);
+
+ payload_end = payload + payload_len;
+
+ if (global_id && ac->global_id != global_id) {
+ dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+ ac->global_id = global_id;
+ }
+
+ if (ac->negotiating) {
+ /* server does not support our protocols? */
+ if (!protocol && result < 0) {
+ ret = result;
+ goto out;
+ }
+ /* set up (new) protocol handler? */
+ if (ac->protocol && ac->protocol != protocol) {
+ ac->ops->destroy(ac);
+ ac->protocol = 0;
+ ac->ops = NULL;
+ }
+ if (ac->protocol != protocol) {
+ ret = ceph_auth_init_protocol(ac, protocol);
+ if (ret) {
+ pr_err("error %d on auth protocol %d init\n",
+ ret, protocol);
+ goto out;
+ }
+ }
+ }
+
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+ if (ret == -EAGAIN) {
+ struct ceph_mon_request_header *monhdr = reply_buf;
+ void *p = reply_buf + 1;
+ void *end = reply_buf + reply_len;
+
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, ac->protocol);
+
+ ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+ if (ret < 0) {
+ pr_err("error %d building request\n", ret);
+ goto out;
+ }
+ dout(" built request %d bytes\n", ret);
+ ceph_encode_32(&p, ret);
+ return p + ret - reply_buf;
+ } else if (ret) {
+ pr_err("authentication error %d\n", ret);
+ return ret;
+ }
+ return 0;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+out:
+ return ret;
+}
+
+
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..4d8cdf6bb3b6
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,77 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys. These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+ /*
+ * true if we are authenticated and can connect to
+ * services.
+ */
+ int (*is_authenticated)(struct ceph_auth_client *ac);
+
+ /*
+ * build requests and process replies during monitor
+ * handshake. if handle_reply returns -EAGAIN, we build
+ * another request.
+ */
+ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, int result,
+ void *buf, void *end);
+
+ /*
+ * Create authorizer for connecting to a service, and verify
+ * the response to authenticate the service.
+ */
+ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len);
+ int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len);
+ void (*destroy_authorizer)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a);
+
+ /* reset when we (re)connect to a monitor */
+ void (*reset)(struct ceph_auth_client *ac);
+
+ void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+ u32 protocol; /* CEPH_AUTH_* */
+ void *private; /* for use by protocol implementation */
+ const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
+
+ bool negotiating; /* true if negotiating protocol */
+ const char *name; /* entity name */
+ u64 global_id; /* our unique id in system */
+ const char *secret; /* our secret key */
+ unsigned want_keys; /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+ const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+ void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..631017eb7117
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,120 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_none.h"
+#include "auth.h"
+#include "decode.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = false;
+ return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_auth_none_info *ai = ac->private;
+ struct ceph_none_authorizer *au = &ai->au;
+ void *p, *end;
+ int ret;
+
+ if (!ai->built_authorizer) {
+ p = au->buf;
+ end = p + sizeof(au->buf);
+ ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+ if (ret < 0)
+ goto bad;
+ ceph_decode_need(&p, end, sizeof(u64), bad2);
+ ceph_encode_64(&p, ac->global_id);
+ au->buf_len = p - (void *)au->buf;
+ ai->built_authorizer = true;
+ dout("built authorizer len %d\n", au->buf_len);
+ }
+
+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf;
+ *len = au->buf_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+
+bad2:
+ ret = -ERANGE;
+bad:
+ return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ /* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+ .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+
+ ac->protocol = CEPH_AUTH_NONE;
+ ac->private = xi;
+ ac->ops = &ceph_auth_none_ops;
+ return 0;
+}
+
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ char buf[128];
+ int buf_len;
+ char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+ bool built_authorizer;
+ struct ceph_none_authorizer au; /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..cf9aaccef22b
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,34 @@
+
+#include "ceph_debug.h"
+#include "buffer.h"
+
+struct ceph_buffer *ceph_buffer_new(gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+ atomic_set(&b->nref, 1);
+ b->vec.iov_base = NULL;
+ b->vec.iov_len = 0;
+ b->alloc_len = 0;
+ return b;
+}
+
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ b->is_vmalloc = true;
+ }
+ if (!b->vec.iov_base)
+ return -ENOMEM;
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ return 0;
+}
+
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..16b1930acc45
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,55 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+ atomic_t nref;
+ struct kvec vec;
+ size_t alloc_len;
+ bool is_vmalloc;
+};
+
+struct ceph_buffer *ceph_buffer_new(gfp_t gfp);
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+ atomic_inc(&b->nref);
+ return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+ if (b && atomic_dec_and_test(&b->nref)) {
+ if (b->vec.iov_base) {
+ if (b->is_vmalloc)
+ vfree(b->vec.iov_base);
+ else
+ kfree(b->vec.iov_base);
+ }
+ kfree(b);
+ }
+}
+
+static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp)
+{
+ struct ceph_buffer *b = ceph_buffer_new(gfp);
+
+ if (b && ceph_buffer_alloc(b, len, gfp) < 0) {
+ ceph_buffer_put(b);
+ b = NULL;
+ }
+ return b;
+}
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..9dd110602cda
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2863 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes). Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server. When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+ if (c & CEPH_CAP_GSHARED)
+ *s++ = 's';
+ if (c & CEPH_CAP_GEXCL)
+ *s++ = 'x';
+ if (c & CEPH_CAP_GCACHE)
+ *s++ = 'c';
+ if (c & CEPH_CAP_GRD)
+ *s++ = 'r';
+ if (c & CEPH_CAP_GWR)
+ *s++ = 'w';
+ if (c & CEPH_CAP_GBUFFER)
+ *s++ = 'b';
+ if (c & CEPH_CAP_GLAZYIO)
+ *s++ = 'l';
+ return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+ int i;
+ char *s;
+ int c;
+
+ spin_lock(&cap_str_lock);
+ i = last_cap_str++;
+ if (last_cap_str == MAX_CAP_STR)
+ last_cap_str = 0;
+ spin_unlock(&cap_str_lock);
+
+ s = cap_str[i];
+
+ if (caps & CEPH_CAP_PIN)
+ *s++ = 'p';
+
+ c = (caps >> CEPH_CAP_SAUTH) & 3;
+ if (c) {
+ *s++ = 'A';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SLINK) & 3;
+ if (c) {
+ *s++ = 'L';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SXATTR) & 3;
+ if (c) {
+ *s++ = 'X';
+ s = gcap_string(s, c);
+ }
+
+ c = caps >> CEPH_CAP_SFILE;
+ if (c) {
+ *s++ = 'F';
+ s = gcap_string(s, c);
+ }
+
+ if (s == cap_str[i])
+ *s++ = '-';
+ *s = 0;
+ return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations. This ensures that we preallocate
+ * memory needed to successfully process an MDS response. (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list; /* unused (reserved or unreserved) */
+static int caps_total_count; /* total caps allocated */
+static int caps_use_count; /* in use */
+static int caps_reserve_count; /* unused, reserved */
+static int caps_avail_count; /* unused, unreserved */
+
+void __init ceph_caps_init(void)
+{
+ INIT_LIST_HEAD(&caps_list);
+ spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&caps_list_lock);
+ while (!list_empty(&caps_list)) {
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ caps_total_count = 0;
+ caps_avail_count = 0;
+ caps_use_count = 0;
+ caps_reserve_count = 0;
+ spin_unlock(&caps_list_lock);
+}
+
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+ int i;
+ struct ceph_cap *cap;
+ int have;
+ int alloc = 0;
+ LIST_HEAD(newcaps);
+ int ret = 0;
+
+ dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+ /* first reserve any caps that are already allocated */
+ spin_lock(&caps_list_lock);
+ if (caps_avail_count >= need)
+ have = need;
+ else
+ have = caps_avail_count;
+ caps_avail_count -= have;
+ caps_reserve_count += have;
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ for (i = have; i < need; i++) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (!cap) {
+ ret = -ENOMEM;
+ goto out_alloc_count;
+ }
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ }
+ BUG_ON(have + alloc != need);
+
+ spin_lock(&caps_list_lock);
+ caps_total_count += alloc;
+ caps_reserve_count += alloc;
+ list_splice(&newcaps, &caps_list);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ ctx->count = need;
+ dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+ ctx, caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ return 0;
+
+out_alloc_count:
+ /* we didn't manage to reserve as much as we needed */
+ pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have);
+ return ret;
+}
+
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+ dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ if (ctx->count) {
+ spin_lock(&caps_list_lock);
+ BUG_ON(caps_reserve_count < ctx->count);
+ caps_reserve_count -= ctx->count;
+ caps_avail_count += ctx->count;
+ ctx->count = 0;
+ dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+ caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ }
+ return 0;
+}
+
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+ struct ceph_cap *cap = NULL;
+
+ /* temporary, until we do something about cap import/export */
+ if (!ctx)
+ return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+ spin_lock(&caps_list_lock);
+ dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx->count, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ BUG_ON(!ctx->count);
+ BUG_ON(ctx->count > caps_reserve_count);
+ BUG_ON(list_empty(&caps_list));
+
+ ctx->count--;
+ caps_reserve_count--;
+ caps_use_count++;
+
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ return cap;
+}
+
+static void put_cap(struct ceph_cap *cap,
+ struct ceph_cap_reservation *ctx)
+{
+ spin_lock(&caps_list_lock);
+ dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ caps_use_count--;
+ /*
+ * Keep some preallocated caps around, at least enough to do a
+ * readdir (which needs to preallocate lots of them), to avoid
+ * lots of free/alloc churn.
+ */
+ if (caps_avail_count >= caps_reserve_count +
+ ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) {
+ caps_total_count--;
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ if (ctx) {
+ ctx->count++;
+ caps_reserve_count++;
+ } else {
+ caps_avail_count++;
+ }
+ list_add(&cap->caps_item, &caps_list);
+ }
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used, int *reserved)
+{
+ if (total)
+ *total = caps_total_count;
+ if (avail)
+ *avail = caps_avail_count;
+ if (used)
+ *used = caps_use_count;
+ if (reserved)
+ *reserved = caps_reserve_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+ struct rb_node *n = ci->i_caps.rb_node;
+
+ while (n) {
+ cap = rb_entry(n, struct ceph_cap, ci_node);
+ if (mds < cap->mds)
+ n = n->rb_left;
+ else if (mds > cap->mds)
+ n = n->rb_right;
+ else
+ return cap;
+ }
+ return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+ struct ceph_cap *cap;
+ int mds = -1;
+ struct rb_node *p;
+
+ /* prefer mds with WR|WRBUFFER|EXCL caps */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ mds = cap->mds;
+ if (mseq)
+ *mseq = cap->mseq;
+ if (cap->issued & (CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_FILE_EXCL))
+ break;
+ }
+ return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+ int mds;
+ spin_lock(&inode->i_lock);
+ mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+ spin_unlock(&inode->i_lock);
+ return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+ struct ceph_cap *new)
+{
+ struct rb_node **p = &ci->i_caps.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap *cap = NULL;
+
+ while (*p) {
+ parent = *p;
+ cap = rb_entry(parent, struct ceph_cap, ci_node);
+ if (new->mds < cap->mds)
+ p = &(*p)->rb_left;
+ else if (new->mds > cap->mds)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->ci_node, parent, p);
+ rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS. Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+ ci->i_hold_caps_min = round_jiffies(jiffies +
+ ma->caps_wanted_delay_min * HZ);
+ ci->i_hold_caps_max = round_jiffies(jiffies +
+ ma->caps_wanted_delay_max * HZ);
+ dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ * -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ __cap_set_timeouts(mdsc, ci);
+ dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ ci->i_ceph_flags, ci->i_hold_caps_max);
+ if (!mdsc->stopping) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (!list_empty(&ci->i_cap_delay_list)) {
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ goto no_change;
+ list_del_init(&ci->i_cap_delay_list);
+ }
+ list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+ spin_unlock(&mdsc->cap_delay_lock);
+ }
+}
+
+/*
+ * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ if (list_empty(&ci->i_cap_delay_list))
+ return;
+ spin_lock(&mdsc->cap_delay_lock);
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+ unsigned issued)
+{
+ unsigned had = __ceph_caps_issued(ci, NULL);
+
+ /*
+ * Each time we receive FILE_CACHE anew, we increment
+ * i_rdcache_gen.
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) &&
+ (had & CEPH_CAP_FILE_CACHE) == 0)
+ ci->i_rdcache_gen++;
+
+ /*
+ * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * don't know what happened to this directory while we didn't
+ * have the cap.
+ */
+ if ((issued & CEPH_CAP_FILE_SHARED) &&
+ (had & CEPH_CAP_FILE_SHARED) == 0) {
+ ci->i_shared_gen++;
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ }
+ }
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0. (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *new_cap = NULL;
+ struct ceph_cap *cap;
+ int mds = session->s_mds;
+ int actual_wanted;
+
+ dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+ session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+ /*
+ * If we are opening the file, include file mode wanted bits
+ * in wanted.
+ */
+ if (fmode >= 0)
+ wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (new_cap) {
+ cap = new_cap;
+ new_cap = NULL;
+ } else {
+ spin_unlock(&inode->i_lock);
+ new_cap = get_cap(caps_reservation);
+ if (new_cap == NULL)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ cap->issued = 0;
+ cap->implemented = 0;
+ cap->mds = mds;
+ cap->mds_wanted = 0;
+
+ cap->ci = ci;
+ __insert_cap_node(ci, cap);
+
+ /* clear out old exporting info? (i.e. on cap import) */
+ if (ci->i_cap_exporting_mds == mds) {
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ }
+
+ /* add to session cap list */
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ session->s_nr_caps++;
+ spin_unlock(&session->s_cap_lock);
+ }
+
+ if (!ci->i_snap_realm) {
+ /*
+ * add this inode to the appropriate snap realm
+ */
+ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+ realmino);
+ if (realm) {
+ ceph_get_snap_realm(mdsc, realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ ci->i_snap_realm = realm;
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ } else {
+ pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+ realmino);
+ }
+ }
+
+ __check_cap_issue(ci, cap, issued);
+
+ /*
+ * If we are issued caps we don't want, or the mds' wanted
+ * value appears to be off, queue a check so we'll release
+ * later and/or update the mds wanted value.
+ */
+ actual_wanted = __ceph_caps_wanted(ci);
+ if ((wanted & ~actual_wanted) ||
+ (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+ dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
+ __cap_delay_requeue(mdsc, ci);
+ }
+
+ if (flags & CEPH_CAP_FLAG_AUTH)
+ ci->i_auth_cap = cap;
+ else if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
+ cap->cap_id = cap_id;
+ cap->issued = issued;
+ cap->implemented |= issued;
+ cap->mds_wanted |= wanted;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ cap->mseq = mseq;
+ cap->cap_gen = session->s_cap_gen;
+
+ if (fmode >= 0)
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ wake_up(&ci->i_cap_wq);
+ return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+ unsigned long ttl;
+ u32 gen;
+
+ spin_lock(&cap->session->s_cap_lock);
+ gen = cap->session->s_cap_gen;
+ ttl = cap->session->s_cap_ttl;
+ spin_unlock(&cap->session->s_cap_lock);
+
+ if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+ dout("__cap_is_valid %p cap %p issued %s "
+ "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us. Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ if (implemented)
+ *implemented = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ dout("__ceph_caps_issued %p cap %p issued %s\n",
+ &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ have |= cap->issued;
+ if (implemented)
+ *implemented |= cap->implemented;
+ }
+ return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap == ocap)
+ continue;
+ if (!__cap_is_valid(cap))
+ continue;
+ have |= cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *s = cap->session;
+
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ s->s_mds);
+ spin_lock(&s->s_cap_lock);
+ list_move_tail(&cap->session_caps, &s->s_caps);
+ spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask. If so, move the cap(s) to the
+ * front of their respective LRUs. (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int have = ci->i_snap_caps;
+
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p snap issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(have),
+ ceph_cap_string(mask));
+ return 1;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ if ((cap->issued & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p cap %p issued %s"
+ " (mask %s)\n", &ci->vfs_inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch)
+ __touch_cap(cap);
+ return 1;
+ }
+
+ /* does a combination of caps satisfy mask? */
+ have |= cap->issued;
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p combo issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch) {
+ struct rb_node *q;
+
+ /* touch this + preceeding caps */
+ __touch_cap(cap);
+ for (q = rb_first(&ci->i_caps); q != p;
+ q = rb_next(q)) {
+ cap = rb_entry(q, struct ceph_cap,
+ ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ __touch_cap(cap);
+ }
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (__cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask)) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ dout("ceph_caps_revoking %p %s = %d\n", inode,
+ ceph_cap_string(mask), ret);
+ return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+ int used = 0;
+ if (ci->i_pin_ref)
+ used |= CEPH_CAP_PIN;
+ if (ci->i_rd_ref)
+ used |= CEPH_CAP_FILE_RD;
+ if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+ used |= CEPH_CAP_FILE_CACHE;
+ if (ci->i_wr_ref)
+ used |= CEPH_CAP_FILE_WR;
+ if (ci->i_wrbuffer_ref)
+ used |= CEPH_CAP_FILE_BUFFER;
+ return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+ int want = 0;
+ int mode;
+ for (mode = 0; mode < 4; mode++)
+ if (ci->i_nr_by_mode[mode])
+ want |= ceph_caps_for_mode(mode);
+ return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int mds_wanted = 0;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ mds_wanted |= cap->mds_wanted;
+ }
+ return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * caller should hold i_lock, and session s_mutex.
+ * returns true if this is the last cap. if so, caller should iput.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap,
+ struct ceph_cap_reservation *ctx)
+{
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+ /* remove from session list */
+ spin_lock(&session->s_cap_lock);
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ spin_unlock(&session->s_cap_lock);
+
+ /* remove from inode list */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ cap->session = NULL;
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ put_cap(cap, ctx);
+
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+ if (!__ceph_is_any_real_caps(ci))
+ __cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+ u64 ino, u64 cid, int op,
+ int caps, int wanted, int dirty,
+ u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+ u64 size, u64 max_size,
+ struct timespec *mtime, struct timespec *atime,
+ u64 time_warp_seq,
+ uid_t uid, gid_t gid, mode_t mode,
+ u64 xattr_version,
+ struct ceph_buffer *xattrs_buf,
+ u64 follows)
+{
+ struct ceph_mds_caps *fc;
+ struct ceph_msg *msg;
+
+ dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+ " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+ cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+ ceph_cap_string(dirty),
+ seq, issue_seq, mseq, follows, size, max_size,
+ xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ fc = msg->front.iov_base;
+
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(cid);
+ fc->op = cpu_to_le32(op);
+ fc->seq = cpu_to_le32(seq);
+ fc->client_tid = cpu_to_le64(flush_tid);
+ fc->issue_seq = cpu_to_le32(issue_seq);
+ fc->migrate_seq = cpu_to_le32(mseq);
+ fc->caps = cpu_to_le32(caps);
+ fc->wanted = cpu_to_le32(wanted);
+ fc->dirty = cpu_to_le32(dirty);
+ fc->ino = cpu_to_le64(ino);
+ fc->snap_follows = cpu_to_le64(follows);
+
+ fc->size = cpu_to_le64(size);
+ fc->max_size = cpu_to_le64(max_size);
+ if (mtime)
+ ceph_encode_timespec(&fc->mtime, mtime);
+ if (atime)
+ ceph_encode_timespec(&fc->atime, atime);
+ fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+ fc->uid = cpu_to_le32(uid);
+ fc->gid = cpu_to_le32(gid);
+ fc->mode = cpu_to_le32(mode);
+
+ fc->xattr_version = cpu_to_le64(xattr_version);
+ if (xattrs_buf) {
+ msg->middle = ceph_buffer_get(xattrs_buf);
+ fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ }
+
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our
+ * cache.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+
+ spin_lock(&inode->i_lock);
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+
+ spin_lock(&session->s_cap_lock);
+ BUG_ON(!session->s_num_cap_releases);
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+
+ dout(" adding %p release to mds%d msg %p (%d left)\n",
+ inode, session->s_mds, msg, session->s_num_cap_releases);
+
+ BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(ceph_ino(inode));
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+
+ session->s_num_cap_releases--;
+
+ msg->front.iov_len += sizeof(*item);
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ dout(" release msg %p full\n", msg);
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ } else {
+ dout(" release msg %p at %d/%d (%d)\n", msg,
+ (int)le32_to_cpu(head->num),
+ (int)CEPH_CAPS_PER_RELEASE,
+ (int)msg->front.iov_len);
+ }
+ spin_unlock(&session->s_cap_lock);
+ p = rb_next(p);
+ __ceph_remove_cap(cap, NULL);
+
+ }
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Send a cap msg on the given inode. Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE. Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ int op, int used, int want, int retain, int flushing,
+ unsigned *pflush_tid)
+ __releases(cap->ci->vfs_inode->i_lock)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ u64 cap_id = cap->cap_id;
+ int held = cap->issued | cap->implemented;
+ int revoking = cap->implemented & ~cap->issued;
+ int dropping = cap->issued & ~retain;
+ int keep;
+ u64 seq, issue_seq, mseq, time_warp_seq, follows;
+ u64 size, max_size;
+ struct timespec mtime, atime;
+ int wake = 0;
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+ struct ceph_mds_session *session;
+ u64 xattr_version = 0;
+ int delayed = 0;
+ u64 flush_tid = 0;
+ int i;
+ int ret;
+
+ dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+ inode, cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
+ BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+ session = cap->session;
+
+ /* don't release wanted unless we've waited a bit. */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_min)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ want |= cap->mds_wanted;
+ retain |= cap->issued;
+ delayed = 1;
+ }
+ ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+ cap->issued &= retain; /* drop bits we don't want */
+ if (cap->implemented & ~cap->issued) {
+ /*
+ * Wake up any waiters on wanted -> needed transition.
+ * This is due to the weird transition from buffered
+ * to sync IO... we need to flush dirty pages _before_
+ * allowing sync writes to avoid reordering.
+ */
+ wake = 1;
+ }
+ cap->implemented &= cap->issued | used;
+ cap->mds_wanted = want;
+
+ if (flushing) {
+ /*
+ * assign a tid for flush operations so we can avoid
+ * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+ * clean type races. track latest tid for every bit
+ * so we can handle flush AxFw, flush Fw, and have the
+ * first ack clean Ax.
+ */
+ flush_tid = ++ci->i_cap_flush_last_tid;
+ if (pflush_tid)
+ *pflush_tid = flush_tid;
+ dout(" cap_flush_tid %d\n", (int)flush_tid);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if (flushing & (1 << i))
+ ci->i_cap_flush_tid[i] = flush_tid;
+ }
+
+ keep = cap->implemented;
+ seq = cap->seq;
+ issue_seq = cap->issue_seq;
+ mseq = cap->mseq;
+ size = inode->i_size;
+ ci->i_reported_size = size;
+ max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = max_size;
+ mtime = inode->i_mtime;
+ atime = inode->i_atime;
+ time_warp_seq = ci->i_time_warp_seq;
+ follows = ci->i_snap_realm->cached_context->seq;
+ uid = inode->i_uid;
+ gid = inode->i_gid;
+ mode = inode->i_mode;
+
+ if (dropping & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ xattr_version = ci->i_xattrs.version + 1;
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ if (dropping & CEPH_CAP_FILE_CACHE) {
+ /* invalidate what we can */
+ dout("invalidating pages on %p\n", inode);
+ invalidate_mapping_pages(&inode->i_data, 0, -1);
+ }
+
+ ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+ op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ size, max_size, &mtime, &atime, time_warp_seq,
+ uid, gid, mode,
+ xattr_version,
+ (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+ follows);
+ if (ret < 0) {
+ dout("error sending cap msg, must requeue %p\n", inode);
+ delayed = 1;
+ }
+
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+
+ return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken. This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock. Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int mds;
+ struct ceph_cap_snap *capsnap;
+ u32 mseq;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+ session->s_mutex */
+ u64 next_follows = 0; /* keep track of how far we've gotten through the
+ i_cap_snaps list, and skip these entries next time
+ around to avoid an infinite loop */
+
+ if (psession)
+ session = *psession;
+
+ dout("__flush_snaps %p\n", inode);
+retry:
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ /* avoid an infiniute loop after retry */
+ if (capsnap->follows < next_follows)
+ continue;
+ /*
+ * we need to wait for sync writes to complete and for dirty
+ * pages to be written out.
+ */
+ if (capsnap->dirty_pages || capsnap->writing)
+ continue;
+
+ /* pick mds, take s_mutex */
+ mds = __ceph_get_cap_mds(ci, &mseq);
+ if (session && session->s_mds != mds) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ mutex_lock(&session->s_mutex);
+ }
+ /*
+ * if session == NULL, we raced against a cap
+ * deletion. retry, and we'll get a better
+ * @mds value next time.
+ */
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ atomic_inc(&capsnap->nref);
+ if (!list_empty(&capsnap->flushing_item))
+ list_del_init(&capsnap->flushing_item);
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
+ spin_unlock(&inode->i_lock);
+
+ dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+ inode, capsnap, next_follows, capsnap->size);
+ send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+ capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ 0, NULL,
+ capsnap->follows);
+
+ next_follows = capsnap->follows + 1;
+ ceph_put_cap_snap(capsnap);
+
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (psession)
+ *psession = session;
+ else if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, NULL);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty. If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ int was = ci->i_dirty_caps;
+ int dirty = 0;
+
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ ceph_cap_string(mask), ceph_cap_string(was),
+ ceph_cap_string(was | mask));
+ ci->i_dirty_caps |= mask;
+ if (was == 0) {
+ dout(" inode %p now dirty\n", &ci->vfs_inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ igrab(inode);
+ dirty |= I_DIRTY_SYNC;
+ }
+ }
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+ (mask & CEPH_CAP_FILE_BUFFER))
+ dirty |= I_DIRTY_DATASYNC;
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ __cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list. Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int flushing;
+
+ BUG_ON(ci->i_dirty_caps == 0);
+ BUG_ON(list_empty(&ci->i_dirty_item));
+
+ flushing = ci->i_dirty_caps;
+ dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
+ ci->i_flushing_caps |= flushing;
+ ci->i_dirty_caps = 0;
+ dout(" inode %p now !dirty\n", inode);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_del_init(&ci->i_dirty_item);
+
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ mdsc->num_cap_flushing++;
+ dout(" inode %p now flushing seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ } else {
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ dout(" inode %p now flushing (more) seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ return flushing;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps. Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ * cap release further.
+ * CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ * further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session)
+{
+ struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int file_wanted, used;
+ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
+ int drop_session_lock = session ? 0 : 1;
+ int want, retain, revoking, flushing = 0;
+ int mds = -1; /* keep track of how far we've gone through i_caps list
+ to avoid an infinite loop on retry */
+ struct rb_node *p;
+ int tried_invalidate = 0;
+ int delayed = 0, sent = 0, force_requeue = 0, num;
+ int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+ /* if we are unmounting, flush any unused caps immediately. */
+ if (mdsc->stopping)
+ is_delayed = 1;
+
+ spin_lock(&inode->i_lock);
+
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ flags |= CHECK_CAPS_FLUSH;
+
+ /* flush snaps first time around only */
+ if (!list_empty(&ci->i_cap_snaps))
+ __ceph_flush_snaps(ci, &session);
+ goto retry_locked;
+retry:
+ spin_lock(&inode->i_lock);
+retry_locked:
+ file_wanted = __ceph_caps_file_wanted(ci);
+ used = __ceph_caps_used(ci);
+ want = file_wanted | used;
+
+ retain = want | CEPH_CAP_PIN;
+ if (!mdsc->stopping && inode->i_nlink > 0) {
+ if (want) {
+ retain |= CEPH_CAP_ANY; /* be greedy */
+ } else {
+ retain |= CEPH_CAP_ANY_SHARED;
+ /*
+ * keep RD only if we didn't have the file open RW,
+ * because then the mds would revoke it anyway to
+ * journal max_size=0.
+ */
+ if (ci->i_max_size == 0)
+ retain |= CEPH_CAP_ANY_RD;
+ }
+ }
+
+ dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+ " issued %s retain %s %s%s%s\n", inode,
+ ceph_cap_string(file_wanted),
+ ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(__ceph_caps_issued(ci, NULL)),
+ ceph_cap_string(retain),
+ (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+ (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+ /*
+ * If we no longer need to hold onto old our caps, and we may
+ * have cached pages, but don't want them, then try to invalidate.
+ * If we fail, it's because pages are locked.... try again later.
+ */
+ if ((!is_delayed || mdsc->stopping) &&
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ ci->i_rdcache_gen && /* may have cached pages */
+ file_wanted == 0 && /* no open files */
+ !ci->i_truncate_pending &&
+ !tried_invalidate) {
+ u32 invalidating_gen = ci->i_rdcache_gen;
+ int ret;
+
+ dout("check_caps trying to invalidate on %p\n", inode);
+ spin_unlock(&inode->i_lock);
+ ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&inode->i_lock);
+ if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
+ /* success. */
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ } else {
+ dout("check_caps failed to invalidate pages\n");
+ /* we failed to invalidate pages. check these
+ caps again later. */
+ force_requeue = 1;
+ __cap_set_timeouts(mdsc, ci);
+ }
+ tried_invalidate = 1;
+ goto retry_locked;
+ }
+
+ num = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ num++;
+
+ /* avoid looping forever */
+ if (mds >= cap->mds ||
+ ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+ continue;
+
+ /* NOTE: no side-effects allowed, until we take s_mutex */
+
+ revoking = cap->implemented & ~cap->issued;
+ if (revoking)
+ dout("mds%d revoking %s\n", cap->mds,
+ ceph_cap_string(revoking));
+
+ if (cap == ci->i_auth_cap &&
+ (cap->issued & CEPH_CAP_FILE_WR)) {
+ /* request larger max_size from MDS? */
+ if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ dout("requesting new max_size\n");
+ goto ack;
+ }
+
+ /* approaching file_max? */
+ if ((inode->i_size << 1) >= ci->i_max_size &&
+ (ci->i_reported_size << 1) < ci->i_max_size) {
+ dout("i_size approaching max_size\n");
+ goto ack;
+ }
+ }
+ /* flush anything dirty? */
+ if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+ ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking && (revoking & used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /* want more caps from mds? */
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+
+ /* things we might delay */
+ if ((cap->issued & ~retain) == 0 &&
+ cap->mds_wanted == want)
+ continue; /* nope, all good */
+
+ if (is_delayed)
+ goto ack;
+
+ /* delay? */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ delayed++;
+ continue;
+ }
+
+ack:
+ if (session && session != cap->session) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ session = NULL;
+ }
+ if (!session) {
+ session = cap->session;
+ if (mutex_trylock(&session->s_mutex) == 0) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ spin_unlock(&inode->i_lock);
+ if (took_snap_rwsem) {
+ up_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 0;
+ }
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ }
+ /* take snap_rwsem after session mutex */
+ if (!took_snap_rwsem) {
+ if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+ dout("inverting snap/in locks on %p\n",
+ inode);
+ spin_unlock(&inode->i_lock);
+ down_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 1;
+ goto retry;
+ }
+ took_snap_rwsem = 1;
+ }
+
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+ flushing = __mark_caps_flushing(inode, session);
+
+ mds = cap->mds; /* remember mds, so we don't repeat */
+ sent++;
+
+ /* __send_cap drops i_lock */
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+ retain, flushing, NULL);
+ goto retry; /* retake i_lock and restart our cap scan. */
+ }
+
+ /*
+ * Reschedule delayed caps release if we delayed anything,
+ * otherwise cancel.
+ */
+ if (delayed && is_delayed)
+ force_requeue = 1; /* __send_cap delayed release; requeue */
+ if (!delayed && !is_delayed)
+ __cap_delay_cancel(mdsc, ci);
+ else if (!is_delayed || force_requeue)
+ __cap_delay_requeue(mdsc, ci);
+
+ spin_unlock(&inode->i_lock);
+
+ if (session && drop_session_lock)
+ mutex_unlock(&session->s_mutex);
+ if (took_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+ unsigned *flush_tid)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int unlock_session = session ? 0 : 1;
+ int flushing = 0;
+
+retry:
+ spin_lock(&inode->i_lock);
+ if (ci->i_dirty_caps && ci->i_auth_cap) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ int used = __ceph_caps_used(ci);
+ int want = __ceph_caps_wanted(ci);
+ int delayed;
+
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ session = cap->session;
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ BUG_ON(session != cap->session);
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+ goto out;
+
+ flushing = __mark_caps_flushing(inode, session);
+
+ /* __send_cap drops i_lock */
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+ cap->issued | cap->implemented, flushing,
+ flush_tid);
+ if (!delayed)
+ goto out_unlocked;
+
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ }
+out:
+ spin_unlock(&inode->i_lock);
+out_unlocked:
+ if (session && unlock_session)
+ mutex_unlock(&session->s_mutex);
+ return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int dirty, i, ret = 1;
+
+ spin_lock(&inode->i_lock);
+ dirty = __ceph_caps_dirty(ci);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((ci->i_flushing_caps & (1 << i)) &&
+ ci->i_cap_flush_tid[i] <= tid) {
+ /* still flushing this bit */
+ ret = 0;
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+ req = list_entry(head->prev, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ spin_lock(&ci->i_unsafe_lock);
+ ceph_osdc_put_request(req);
+
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_entry(head->next, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
+
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned flush_tid;
+ int ret;
+ int dirty;
+
+ dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ sync_write_wait(inode);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+ /*
+ * only wait on non-file metadata writeback (the mds
+ * can recover size and mtime, so we don't need to
+ * wait for that)
+ */
+ if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+ dout("fsync waiting for flush_tid %u\n", flush_tid);
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ }
+
+ dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+ return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds. If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, int wait)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned flush_tid;
+ int err = 0;
+ int dirty;
+
+ dout("write_inode %p wait=%d\n", inode, wait);
+ if (wait) {
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ if (dirty)
+ err = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ } else {
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+ spin_lock(&inode->i_lock);
+ if (__ceph_caps_dirty(ci))
+ __cap_delay_requeue_front(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_cap_snap *capsnap;
+
+ dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+ list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+ flushing_item) {
+ struct ceph_inode_info *ci = capsnap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+ cap, capsnap);
+ __ceph_flush_snaps(ci, &session);
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+
+ kick_flushing_capsnaps(mdsc, session);
+
+ dout("kick_flushing_caps mds%d\n", session->s_mds);
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int delayed = 0;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p %s\n", inode,
+ cap, ceph_cap_string(ci->i_flushing_caps));
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ ci->i_flushing_caps, NULL);
+ if (delayed) {
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+ if (got & CEPH_CAP_PIN)
+ ci->i_pin_ref++;
+ if (got & CEPH_CAP_FILE_RD)
+ ci->i_rd_ref++;
+ if (got & CEPH_CAP_FILE_CACHE)
+ ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_WR)
+ ci->i_wr_ref++;
+ if (got & CEPH_CAP_FILE_BUFFER) {
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(&ci->vfs_inode);
+ ci->i_wrbuffer_ref++;
+ dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+ &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+ }
+}
+
+/*
+ * Try to grab cap references. Specify those refs we @want, and the
+ * minimal set we @need. Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff, int *check_max, int *err)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret = 0;
+ int have, implemented;
+
+ dout("get_cap_refs %p need %s want %s\n", inode,
+ ceph_cap_string(need), ceph_cap_string(want));
+ spin_lock(&inode->i_lock);
+
+ /* make sure we _have_ some caps! */
+ if (!__ceph_is_any_caps(ci)) {
+ dout("get_cap_refs %p no real caps\n", inode);
+ *err = -EBADF;
+ ret = 1;
+ goto out;
+ }
+
+ if (need & CEPH_CAP_FILE_WR) {
+ if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+ dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+ inode, endoff, ci->i_max_size);
+ if (endoff > ci->i_wanted_max_size) {
+ *check_max = 1;
+ ret = 1;
+ }
+ goto out;
+ }
+ /*
+ * If a sync write is in progress, we must wait, so that we
+ * can get a final snapshot value for size+mtime.
+ */
+ if (__ceph_have_pending_cap_snap(ci)) {
+ dout("get_cap_refs %p cap_snap_pending\n", inode);
+ goto out;
+ }
+ }
+ have = __ceph_caps_issued(ci, &implemented);
+
+ /*
+ * disallow writes while a truncate is pending
+ */
+ if (ci->i_truncate_pending)
+ have &= ~CEPH_CAP_FILE_WR;
+
+ if ((have & need) == need) {
+ /*
+ * Look at (implemented & ~have & not) so that we keep waiting
+ * on transition from wanted -> needed caps. This is needed
+ * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+ * going before a prior buffered writeback happens.
+ */
+ int not = want & ~(have & need);
+ int revoking = implemented & ~have;
+ dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+ inode, ceph_cap_string(have), ceph_cap_string(not),
+ ceph_cap_string(revoking));
+ if ((revoking & not) == 0) {
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
+ ret = 1;
+ }
+ } else {
+ dout("get_cap_refs %p have %s needed %s\n", inode,
+ ceph_cap_string(have), ceph_cap_string(need));
+ }
+out:
+ spin_unlock(&inode->i_lock);
+ dout("get_cap_refs %p ret %d got %s\n", inode,
+ ret, ceph_cap_string(*got));
+ return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size. If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int check = 0;
+
+ /* do we need to explicitly request a larger max_size? */
+ spin_lock(&inode->i_lock);
+ if ((endoff >= ci->i_max_size ||
+ endoff > (inode->i_size << 1)) &&
+ endoff > ci->i_wanted_max_size) {
+ dout("write %p at large endoff %llu, req max_size\n",
+ inode, endoff);
+ ci->i_wanted_max_size = endoff;
+ check = 1;
+ }
+ spin_unlock(&inode->i_lock);
+ if (check)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references. If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+ loff_t endoff)
+{
+ int check_max, ret, err;
+
+retry:
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
+ check_max = 0;
+ err = 0;
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want,
+ got, endoff,
+ &check_max, &err));
+ if (err)
+ ret = err;
+ if (check_max)
+ goto retry;
+ return ret;
+}
+
+/*
+ * Take cap refs. Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+ spin_lock(&ci->vfs_inode.i_lock);
+ __take_cap_refs(ci, caps);
+ spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0, put = 0, flushsnaps = 0, wake = 0;
+ struct ceph_cap_snap *capsnap;
+
+ spin_lock(&inode->i_lock);
+ if (had & CEPH_CAP_PIN)
+ --ci->i_pin_ref;
+ if (had & CEPH_CAP_FILE_RD)
+ if (--ci->i_rd_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_CACHE)
+ if (--ci->i_rdcache_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_BUFFER) {
+ if (--ci->i_wrbuffer_ref == 0) {
+ last++;
+ put++;
+ }
+ dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+ inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+ }
+ if (had & CEPH_CAP_FILE_WR)
+ if (--ci->i_wr_ref == 0) {
+ last++;
+ if (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ if (capsnap->writing) {
+ capsnap->writing = 0;
+ flushsnaps =
+ __ceph_finish_cap_snap(ci,
+ capsnap);
+ wake = 1;
+ }
+ }
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+ last ? "last" : "");
+
+ if (last && !flushsnaps)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+ if (put)
+ iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context. Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS. If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+ int last_snap = 0;
+ int found = 0;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ spin_lock(&inode->i_lock);
+ ci->i_wrbuffer_ref -= nr;
+ last = !ci->i_wrbuffer_ref;
+
+ if (ci->i_head_snapc == snapc) {
+ ci->i_wrbuffer_ref_head -= nr;
+ if (!ci->i_wrbuffer_ref_head) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+ inode,
+ ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ last ? " LAST" : "");
+ } else {
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ found = 1;
+ capsnap->dirty_pages -= nr;
+ last_snap = !capsnap->dirty_pages;
+ break;
+ }
+ }
+ BUG_ON(!found);
+ dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+ " snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ last_snap ? " (capsnap last)" : "");
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ if (last) {
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ iput(inode);
+ } else if (last_snap) {
+ ceph_flush_snaps(ci);
+ wake_up(&ci->i_cap_wq);
+ }
+}
+
+/*
+ * Handle a cap GRANT message from the MDS. (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex.
+ * return value:
+ * 0 - ok
+ * 1 - check_caps on auth cap only (writeback)
+ * 2 - check_caps (ack revoke)
+ */
+static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap,
+ struct ceph_buffer *xattr_buf)
+ __releases(inode->i_lock)
+
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ int seq = le32_to_cpu(grant->seq);
+ int newcaps = le32_to_cpu(grant->caps);
+ int issued, implemented, used, wanted, dirty;
+ u64 size = le64_to_cpu(grant->size);
+ u64 max_size = le64_to_cpu(grant->max_size);
+ struct timespec mtime, atime, ctime;
+ int reply = 0;
+ int wake = 0;
+ int writeback = 0;
+ int revoked_rdcache = 0;
+ int invalidate_async = 0;
+ int tried_invalidate = 0;
+ int ret;
+
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, mds, seq, ceph_cap_string(newcaps));
+ dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+ inode->i_size);
+
+ /*
+ * If CACHE is being revoked, and we have no dirty buffers,
+ * try to invalidate (once). (If there are dirty buffers, we
+ * will invalidate _after_ writeback.)
+ */
+restart:
+ if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ !ci->i_wrbuffer_ref && !tried_invalidate) {
+ dout("CACHE invalidation\n");
+ spin_unlock(&inode->i_lock);
+ tried_invalidate = 1;
+
+ ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&inode->i_lock);
+ if (ret < 0) {
+ /* there were locked pages.. invalidate later
+ in a separate thread. */
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ invalidate_async = 1;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ }
+ } else {
+ /* we successfully invalidated those pages */
+ revoked_rdcache = 1;
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ }
+ goto restart;
+ }
+
+ /* side effects now are allowed */
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+
+ cap->cap_gen = session->s_cap_gen;
+
+ __check_cap_issue(ci, cap, newcaps);
+
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(grant->mode);
+ inode->i_uid = le32_to_cpu(grant->uid);
+ inode->i_gid = le32_to_cpu(grant->gid);
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ inode->i_uid, inode->i_gid);
+ }
+
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ inode->i_nlink = le32_to_cpu(grant->nlink);
+
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+ int len = le32_to_cpu(grant->xattr_len);
+ u64 version = le64_to_cpu(grant->xattr_version);
+
+ if (version > ci->i_xattrs.version) {
+ dout(" got new xattrs v%llu on %p len %d\n",
+ version, inode, len);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+ ci->i_xattrs.version = version;
+ }
+ }
+
+ /* size/ctime/mtime/atime? */
+ ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size), size);
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+ &atime);
+
+ /* max size increase? */
+ if (max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
+ }
+
+ /* check cap bits */
+ wanted = __ceph_caps_wanted(ci);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+ dout(" my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted),
+ ceph_cap_string(used),
+ ceph_cap_string(dirty));
+ if (wanted != le32_to_cpu(grant->wanted)) {
+ dout("mds wanted %s -> %s\n",
+ ceph_cap_string(le32_to_cpu(grant->wanted)),
+ ceph_cap_string(wanted));
+ grant->wanted = cpu_to_le32(wanted);
+ }
+
+ cap->seq = seq;
+
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+
+ /* revocation, grant, or no-op? */
+ if (cap->issued & ~newcaps) {
+ dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+ writeback = 1; /* will delay ack */
+ else if (dirty & ~newcaps)
+ reply = 1; /* initiate writeback in check_caps */
+ else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+ revoked_rdcache)
+ reply = 2; /* send revoke ack in check_caps */
+ cap->issued = newcaps;
+ } else if (cap->issued == newcaps) {
+ dout("caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ } else {
+ dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ cap->issued = newcaps;
+ cap->implemented |= newcaps; /* add bits only, to
+ * avoid stepping on a
+ * pending revocation */
+ wake = 1;
+ }
+
+ spin_unlock(&inode->i_lock);
+ if (writeback) {
+ /*
+ * queue inode for writeback: we can't actually call
+ * filemap_write_and_wait, etc. from message handler
+ * context.
+ */
+ dout("queueing %p for writeback\n", inode);
+ if (ceph_queue_writeback(inode))
+ igrab(inode);
+ }
+ if (invalidate_async) {
+ dout("queueing %p for page invalidation\n", inode);
+ if (ceph_queue_page_invalidation(inode))
+ igrab(inode);
+ }
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+ return reply;
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+ __releases(inode->i_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ unsigned seq = le32_to_cpu(m->seq);
+ int dirty = le32_to_cpu(m->dirty);
+ int cleaned = 0;
+ u64 flush_tid = le64_to_cpu(m->client_tid);
+ int drop = 0;
+ int i;
+
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((dirty & (1 << i)) &&
+ flush_tid == ci->i_cap_flush_tid[i])
+ cleaned |= 1 << i;
+
+ dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+ " flushing %s -> %s\n",
+ inode, session->s_mds, seq, ceph_cap_string(dirty),
+ ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+ if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ goto out;
+
+ ci->i_flushing_caps &= ~cleaned;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing))
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ mdsc->num_cap_flushing--;
+ wake_up(&mdsc->cap_flushing_wq);
+ dout(" inode %p now !flushing\n", inode);
+
+ if (ci->i_dirty_caps == 0) {
+ dout(" inode %p now clean\n", inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ drop = 1;
+ } else {
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ wake_up(&ci->i_cap_wq);
+
+out:
+ spin_unlock(&inode->i_lock);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 follows = le64_to_cpu(m->snap_follows);
+ u64 flush_tid = le64_to_cpu(m->client_tid);
+ struct ceph_cap_snap *capsnap;
+ int drop = 0;
+
+ dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+ inode, ci, session->s_mds, follows);
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->follows == follows) {
+ if (capsnap->flush_tid != flush_tid) {
+ dout(" cap_snap %p follows %lld tid %lld !="
+ " %lld\n", capsnap, follows,
+ flush_tid, capsnap->flush_tid);
+ break;
+ }
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing cap_snap %p follows %lld\n",
+ capsnap, follows);
+ ceph_put_snap_context(capsnap->context);
+ list_del(&capsnap->ci_item);
+ list_del(&capsnap->flushing_item);
+ ceph_put_cap_snap(capsnap);
+ drop = 1;
+ break;
+ } else {
+ dout(" skipping cap_snap %p follows %lld\n",
+ capsnap, capsnap->follows);
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+ struct ceph_mds_caps *trunc,
+ struct ceph_mds_session *session)
+ __releases(inode->i_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ int seq = le32_to_cpu(trunc->seq);
+ u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+ u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+ u64 size = le64_to_cpu(trunc->size);
+ int implemented = 0;
+ int dirty = __ceph_caps_dirty(ci);
+ int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+ int queue_trunc = 0;
+
+ issued |= implemented | dirty;
+
+ dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+ inode, mds, seq, truncate_size, truncate_seq);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ truncate_seq, truncate_size, size);
+ spin_unlock(&inode->i_lock);
+
+ if (queue_trunc)
+ if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+ &ci->i_vmtruncate_work))
+ igrab(inode);
+}
+
+/*
+ * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
+ * different one. If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ unsigned mseq = le32_to_cpu(ex->migrate_seq);
+ struct ceph_cap *cap = NULL, *t;
+ struct rb_node *p;
+ int remember = 1;
+
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);
+
+ spin_lock(&inode->i_lock);
+
+ /* make sure we haven't seen a higher mseq */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ t = rb_entry(p, struct ceph_cap, ci_node);
+ if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+ dout(" higher mseq on cap from mds%d\n",
+ t->session->s_mds);
+ remember = 0;
+ }
+ if (t->session->s_mds == mds)
+ cap = t;
+ }
+
+ if (cap) {
+ if (remember) {
+ /* make note */
+ ci->i_cap_exporting_mds = mds;
+ ci->i_cap_exporting_mseq = mseq;
+ ci->i_cap_exporting_issued = cap->issued;
+ }
+ __ceph_remove_cap(cap, NULL);
+ } else {
+ WARN_ON(!cap);
+ }
+
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT. If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_session *session,
+ void *snaptrace, int snaptrace_len)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ unsigned issued = le32_to_cpu(im->caps);
+ unsigned wanted = le32_to_cpu(im->wanted);
+ unsigned seq = le32_to_cpu(im->seq);
+ unsigned mseq = le32_to_cpu(im->migrate_seq);
+ u64 realmino = le64_to_cpu(im->realm);
+ u64 cap_id = le64_to_cpu(im->cap_id);
+
+ if (ci->i_cap_exporting_mds >= 0 &&
+ ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+ " - cleared exporting from mds%d\n",
+ inode, ci, mds, mseq,
+ ci->i_cap_exporting_mds);
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ } else {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);
+ }
+
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+ false);
+ downgrade_write(&mdsc->snap_rwsem);
+ ceph_add_cap(inode, session, cap_id, -1,
+ issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+ NULL /* no caps context */);
+ try_flush_caps(inode, session, NULL);
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct super_block *sb = mdsc->client->sb;
+ struct inode *inode;
+ struct ceph_cap *cap;
+ struct ceph_mds_caps *h;
+ int mds = le64_to_cpu(msg->hdr.src.name.num);
+ int op;
+ u32 seq;
+ struct ceph_vino vino;
+ u64 cap_id;
+ u64 size, max_size;
+ int check_caps = 0;
+ int r;
+
+ dout("handle_caps from mds%d\n", mds);
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = msg->front.iov_base;
+ op = le32_to_cpu(h->op);
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ cap_id = le64_to_cpu(h->cap_id);
+ seq = le32_to_cpu(h->seq);
+ size = le64_to_cpu(h->size);
+ max_size = le64_to_cpu(h->max_size);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+ (unsigned)seq);
+
+ /* lookup ino */
+ inode = ceph_find_inode(sb, vino);
+ dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+ vino.snap, inode);
+ if (!inode) {
+ dout(" i don't have ino %llx\n", vino.ino);
+ goto done;
+ }
+
+ /* these will work even if we don't have a cap yet */
+ switch (op) {
+ case CEPH_CAP_OP_FLUSHSNAP_ACK:
+ handle_cap_flushsnap_ack(inode, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_EXPORT:
+ handle_cap_export(inode, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_IMPORT:
+ handle_cap_import(mdsc, inode, h, session,
+ msg->middle,
+ le32_to_cpu(h->snap_trace_len));
+ check_caps = 1; /* we may have sent a RELEASE to the old auth */
+ goto done;
+ }
+
+ /* the rest require a cap */
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ceph_inode(inode), mds);
+ if (!cap) {
+ dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+ inode, ceph_ino(inode), ceph_snap(inode), mds);
+ spin_unlock(&inode->i_lock);
+ goto done;
+ }
+
+ /* note that each of these drops i_lock for us */
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ r = handle_cap_grant(inode, h, session, cap, msg->middle);
+ if (r == 1)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ session);
+ else if (r == 2)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_NODELAY,
+ session);
+ break;
+
+ case CEPH_CAP_OP_FLUSH_ACK:
+ handle_cap_flush_ack(inode, h, session, cap);
+ break;
+
+ case CEPH_CAP_OP_TRUNC:
+ handle_cap_trunc(inode, h, session);
+ break;
+
+ default:
+ spin_unlock(&inode->i_lock);
+ pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
+ }
+
+done:
+ mutex_unlock(&session->s_mutex);
+
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+ if (inode)
+ iput(inode);
+ return;
+
+bad:
+ pr_err("ceph_handle_caps: corrupt message\n");
+ return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ int flags = CHECK_CAPS_NODELAY;
+
+ dout("check_delayed_caps\n");
+ while (1) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (list_empty(&mdsc->cap_delay_list))
+ break;
+ ci = list_first_entry(&mdsc->cap_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max))
+ break;
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+ dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+ ceph_check_caps(ci, flags, NULL);
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ dout("flush_dirty_caps\n");
+ spin_lock(&mdsc->cap_dirty_lock);
+ while (!list_empty(&mdsc->cap_dirty)) {
+ ci = list_first_entry(&mdsc->cap_dirty,
+ struct ceph_inode_info,
+ i_dirty_item);
+ inode = igrab(&ci->vfs_inode);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (inode) {
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+ NULL);
+ iput(inode);
+ }
+ spin_lock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference. If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+ ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+ BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+ if (--ci->i_nr_by_mode[fmode] == 0)
+ last++;
+ spin_unlock(&inode->i_lock);
+
+ if (last && ci->i_vino.snap == CEPH_NOSNAP)
+ ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+ struct ceph_mds_request_release *rel = *p;
+ int ret = 0;
+
+ dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
+ mds, ceph_cap_string(drop), ceph_cap_string(unless));
+
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap && __cap_is_valid(cap)) {
+ if (force ||
+ ((cap->issued & drop) &&
+ (cap->issued & unless) == 0)) {
+ if ((cap->issued & drop) &&
+ (cap->issued & unless) == 0) {
+ dout("encode_inode_release %p cap %p %s -> "
+ "%s\n", inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop));
+ cap->issued &= ~drop;
+ cap->implemented &= ~drop;
+ if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+ int wanted = __ceph_caps_wanted(ci);
+ dout(" wanted %s -> %s (act %s)\n",
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(cap->mds_wanted &
+ ~wanted),
+ ceph_cap_string(wanted));
+ cap->mds_wanted &= wanted;
+ }
+ } else {
+ dout("encode_inode_release %p cap %p %s"
+ " (force)\n", inode, cap,
+ ceph_cap_string(cap->issued));
+ }
+
+ rel->ino = cpu_to_le64(ceph_ino(inode));
+ rel->cap_id = cpu_to_le64(cap->cap_id);
+ rel->seq = cpu_to_le32(cap->seq);
+ rel->issue_seq = cpu_to_le32(cap->issue_seq),
+ rel->mseq = cpu_to_le32(cap->mseq);
+ rel->caps = cpu_to_le32(cap->issued);
+ rel->wanted = cpu_to_le32(cap->mds_wanted);
+ rel->dname_len = 0;
+ rel->dname_seq = 0;
+ *p += sizeof(*rel);
+ ret = 1;
+ } else {
+ dout("encode_inode_release %p cap %p %s\n",
+ inode, cap, ceph_cap_string(cap->issued));
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+ int mds, int drop, int unless)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct ceph_mds_request_release *rel = *p;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int force = 0;
+ int ret;
+
+ /*
+ * force an record for the directory caps if we have a dentry lease.
+ * this is racy (can't take i_lock and d_lock together), but it
+ * doesn't have to be perfect; the mds will revoke anything we don't
+ * release.
+ */
+ spin_lock(&dentry->d_lock);
+ if (di->lease_session && di->lease_session->s_mds == mds)
+ force = 1;
+ spin_unlock(&dentry->d_lock);
+
+ ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+ spin_lock(&dentry->d_lock);
+ if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ dout("encode_dentry_release %p mds%d seq %d\n",
+ dentry, mds, (int)di->lease_seq);
+ rel->dname_len = cpu_to_le32(dentry->d_name.len);
+ memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+ *p += dentry->d_name.len;
+ rel->dname_seq = cpu_to_le32(di->lease_seq);
+ }
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+# define dout(fmt, ...) \
+ pr_debug(" %12.12s:%-4d : " fmt, \
+ ceph_file_part(__FILE__, sizeof(__FILE__)), \
+ __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+# define dout(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+ unsigned va = ceph_frag_value(a);
+ unsigned vb = ceph_frag_value(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ va = ceph_frag_bits(a);
+ vb = ceph_frag_bits(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+ return (b << 24) |
+ (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+ return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+ return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+ return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+ return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+ return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+ /* is sub as specific as us, and contained by us? */
+ return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+ (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f) - 1,
+ ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1,
+ ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+ int newbits = ceph_frag_bits(f) + by;
+ return ceph_frag_make(newbits,
+ ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+ return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+ return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ __u32 os = le32_to_cpu(layout->fl_object_size);
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ return CEPH_FILE_MODE_LAZY;
+#endif
+ if ((flags & O_APPEND) == O_APPEND)
+ flags |= O_WRONLY;
+
+ flags &= O_ACCMODE;
+ if ((flags & O_RDWR) == O_RDWR)
+ return CEPH_FILE_MODE_RDWR;
+ if ((flags & O_WRONLY) == O_WRONLY)
+ return CEPH_FILE_MODE_WR;
+ return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+ switch (mode) {
+ case CEPH_FILE_MODE_PIN:
+ return CEPH_CAP_PIN;
+ case CEPH_FILE_MODE_RD:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ case CEPH_FILE_MODE_RDWR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ case CEPH_FILE_MODE_WR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ }
+ return 0;
+}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..4e5f49c738d8
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,647 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 17
+#define CEPH_VERSION_PATCH 0
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+ "." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+ CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL 7 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
+#define CEPH_MON_PROTOCOL 5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL 22 /* server/client */
+#define CEPH_MDSC_PROTOCOL 30 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le64 tid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 tid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+ __le64 have_version; __le64 have;
+ __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN 1
+#define CEPH_LOCK_ISNAP 2
+#define CEPH_LOCK_IVERSION 4 /* mds internal */
+#define CEPH_LOCK_IFILE 8 /* mds internal */
+#define CEPH_LOCK_IAUTH 32
+#define CEPH_LOCK_ILINK 64
+#define CEPH_LOCK_IDFT 128 /* dir frag tree */
+#define CEPH_LOCK_INEST 256 /* mds internal */
+#define CEPH_LOCK_IXATTR 512
+#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ & & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x00301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE 1
+#define CEPH_SETATTR_UID 2
+#define CEPH_SETATTR_GID 4
+#define CEPH_SETATTR_MTIME 8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE 32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 file_replication;
+ __le32 preferred;
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+
+struct ceph_mds_request_head {
+ __le64 tid, oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le64 tid;
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+ __le64 ino;
+ __le64 snapid;
+ __le32 rdev;
+ __le64 version; /* inode version */
+ __le64 xattr_version; /* version for xattr blob */
+ struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+ struct ceph_file_layout layout;
+ struct ceph_timespec ctime, mtime, atime;
+ __le32 time_warp_seq;
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ __le32 mode, uid, gid;
+ __le32 nlink;
+ __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
+ struct ceph_timespec rctime;
+ struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED 1 /* client can reads */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS 16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+ __le64 client_tid; /* for FLUSH(SNAP) -> FLUSH(SNAP)_ACK */
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms udner new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..1c44e43fe89e
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) { /* all the case statements fall through */
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ case 5:
+ b = b + k[4];
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+ case CEPH_OSD_OP_READ: return "read";
+ case CEPH_OSD_OP_STAT: return "stat";
+
+ case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+ case CEPH_OSD_OP_WRITE: return "write";
+ case CEPH_OSD_OP_DELETE: return "delete";
+ case CEPH_OSD_OP_TRUNCATE: return "truncate";
+ case CEPH_OSD_OP_ZERO: return "zero";
+ case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+ case CEPH_OSD_OP_APPEND: return "append";
+ case CEPH_OSD_OP_STARTSYNC: return "startsync";
+ case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+ case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+ case CEPH_OSD_OP_TMAPUP: return "tmapup";
+ case CEPH_OSD_OP_TMAPGET: return "tmapget";
+ case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+ case CEPH_OSD_OP_GETXATTR: return "getxattr";
+ case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+ case CEPH_OSD_OP_SETXATTR: return "setxattr";
+ case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+ case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+ case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+
+ case CEPH_OSD_OP_PULL: return "pull";
+ case CEPH_OSD_OP_PUSH: return "push";
+ case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+ case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+ case CEPH_OSD_OP_SCRUB: return "scrub";
+
+ case CEPH_OSD_OP_WRLOCK: return "wrlock";
+ case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+ case CEPH_OSD_OP_RDLOCK: return "rdlock";
+ case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+ case CEPH_OSD_OP_UPLOCK: return "uplock";
+ case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+ case CEPH_OSD_OP_CALL: return "call";
+
+ case CEPH_OSD_OP_PGLS: return "pgls";
+ }
+ return "???";
+}
+
+const char *ceph_mds_state_name(int s)
+{
+ switch (s) {
+ /* down and out */
+ case CEPH_MDS_STATE_DNE: return "down:dne";
+ case CEPH_MDS_STATE_STOPPED: return "down:stopped";
+ /* up and out */
+ case CEPH_MDS_STATE_BOOT: return "up:boot";
+ case CEPH_MDS_STATE_STANDBY: return "up:standby";
+ case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_CREATING: return "up:creating";
+ case CEPH_MDS_STATE_STARTING: return "up:starting";
+ /* up and in */
+ case CEPH_MDS_STATE_REPLAY: return "up:replay";
+ case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
+ case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
+ case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
+ case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+ case CEPH_MDS_STATE_ACTIVE: return "up:active";
+ case CEPH_MDS_STATE_STOPPING: return "up:stopping";
+ }
+ return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+ switch (op) {
+ case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+ case CEPH_SESSION_OPEN: return "open";
+ case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+ case CEPH_SESSION_CLOSE: return "close";
+ case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+ case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+ case CEPH_SESSION_STALE: return "stale";
+ case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ }
+ return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+ switch (op) {
+ case CEPH_MDS_OP_LOOKUP: return "lookup";
+ case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
+ case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_SETXATTR: return "setxattr";
+ case CEPH_MDS_OP_SETATTR: return "setattr";
+ case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_READDIR: return "readdir";
+ case CEPH_MDS_OP_MKNOD: return "mknod";
+ case CEPH_MDS_OP_LINK: return "link";
+ case CEPH_MDS_OP_UNLINK: return "unlink";
+ case CEPH_MDS_OP_RENAME: return "rename";
+ case CEPH_MDS_OP_MKDIR: return "mkdir";
+ case CEPH_MDS_OP_RMDIR: return "rmdir";
+ case CEPH_MDS_OP_SYMLINK: return "symlink";
+ case CEPH_MDS_OP_CREATE: return "create";
+ case CEPH_MDS_OP_OPEN: return "open";
+ case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+ case CEPH_MDS_OP_LSSNAP: return "lssnap";
+ case CEPH_MDS_OP_MKSNAP: return "mksnap";
+ case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ }
+ return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+ switch (op) {
+ case CEPH_CAP_OP_GRANT: return "grant";
+ case CEPH_CAP_OP_REVOKE: return "revoke";
+ case CEPH_CAP_OP_TRUNC: return "trunc";
+ case CEPH_CAP_OP_EXPORT: return "export";
+ case CEPH_CAP_OP_IMPORT: return "import";
+ case CEPH_CAP_OP_UPDATE: return "update";
+ case CEPH_CAP_OP_DROP: return "drop";
+ case CEPH_CAP_OP_FLUSH: return "flush";
+ case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+ case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+ case CEPH_CAP_OP_RELEASE: return "release";
+ case CEPH_CAP_OP_RENEW: return "renew";
+ }
+ return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+ switch (o) {
+ case CEPH_MDS_LEASE_REVOKE: return "revoke";
+ case CEPH_MDS_LEASE_RELEASE: return "release";
+ case CEPH_MDS_LEASE_RENEW: return "renew";
+ case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+ }
+ return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+ switch (o) {
+ case CEPH_SNAP_OP_UPDATE: return "update";
+ case CEPH_SNAP_OP_CREATE: return "create";
+ case CEPH_SNAP_OP_DESTROY: return "destroy";
+ case CEPH_SNAP_OP_SPLIT: return "split";
+ }
+ return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";
+ default: return "unknown";
+ }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+ if (p >= b->size)
+ return 0;
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ if (p & 1)
+ return ((struct crush_bucket_tree *)b)->node_weights[p];
+ return 0;
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];
+ }
+ return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+ int i, b, c;
+
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ for (i = 0; i < map->buckets[b]->size; i++) {
+ c = map->buckets[b]->items[i];
+ BUG_ON(c >= map->max_devices ||
+ c < -map->max_buckets);
+ if (c >= 0)
+ map->device_parents[c] = map->buckets[b]->id;
+ else
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
+ }
+ }
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+ break;
+ case CRUSH_BUCKET_LIST:
+ crush_destroy_bucket_list((struct crush_bucket_list *)b);
+ break;
+ case CRUSH_BUCKET_TREE:
+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+ break;
+ }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ int b;
+
+ /* buckets */
+ if (map->buckets) {
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);
+ }
+
+ /* rules */
+ if (map->rules) {
+ for (b = 0; b < map->max_rules; b++)
+ kfree(map->rules[b]);
+ kfree(map->rules);
+ }
+
+ kfree(map->bucket_parents);
+ kfree(map->device_parents);
+ kfree(map);
+}
+
+
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
+#define CRUSH_MAX_SET 10 /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices. A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+ __u32 op;
+ __s32 arg1;
+ __s32 arg2;
+};
+
+/* step op codes */
+enum {
+ CRUSH_RULE_NOOP = 0,
+ CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
+ CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+ /* arg2 = type */
+ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
+ CRUSH_RULE_EMIT = 4, /* no args */
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+ CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N 0
+#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+ __u8 ruleset;
+ __u8 type;
+ __u8 min_size;
+ __u8 max_size;
+};
+
+struct crush_rule {
+ __u32 len;
+ struct crush_rule_mask mask;
+ struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+ (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets). Items within a bucket are chosen using one of a
+ * few different algorithms. The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ * Bucket Alg Speed Additions Removals
+ * ------------------------------------------------
+ * uniform O(1) poor poor
+ * list O(n) optimal poor
+ * tree O(log n) good good
+ * straw O(n) optimal optimal
+ */
+enum {
+ CRUSH_BUCKET_UNIFORM = 1,
+ CRUSH_BUCKET_LIST = 2,
+ CRUSH_BUCKET_TREE = 3,
+ CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+ __s32 id; /* this'll be negative */
+ __u16 type; /* non-zero; type=0 is reserved for devices */
+ __u8 alg; /* one of CRUSH_BUCKET_* */
+ __u8 hash; /* which hash function to use, CRUSH_HASH_* */
+ __u32 weight; /* 16-bit fixed point */
+ __u32 size; /* num items */
+ __s32 *items;
+
+ /*
+ * cached random permutation: used for uniform bucket and for
+ * the linear search fallback for the other bucket types.
+ */
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm;
+};
+
+struct crush_bucket_uniform {
+ struct crush_bucket h;
+ __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *sum_weights; /* 16-bit fixed point. element i is sum
+ of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+ struct crush_bucket h; /* note: h.size is _tree_ size, not number of
+ actual items */
+ __u8 num_nodes;
+ __u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *straws; /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+ struct crush_bucket **buckets;
+ struct crush_rule **rules;
+
+ /*
+ * Parent pointers to identify the parent bucket a device or
+ * bucket in the hierarchy. If an item appears more than
+ * once, this is the _last_ time it appeared (where buckets
+ * are processed in bucket id order, from -1 on down to
+ * -max_buckets.
+ */
+ __u32 *bucket_parents;
+ __u32 *device_parents;
+
+ __s32 max_buckets;
+ __u32 max_rules;
+ __s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include "hash.h"
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:
+ return 0;
+ }
+}
+
+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";
+ default:
+ return "unknown";
+ }
+}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1 0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e);
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..2523d448445c
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+# define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+ int i;
+
+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+ int x, int r)
+{
+ unsigned pr = r % bucket->size;
+ unsigned i, s;
+
+ /* start a new permutation if @x has changed */
+ if (bucket->perm_x != x || bucket->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ bucket->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ bucket->perm[0] = s;
+ bucket->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ for (i = 0; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm_n = 0;
+ } else if (bucket->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm[bucket->perm[0]] = 0;
+ bucket->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < bucket->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+ while (bucket->perm_n <= pr) {
+ unsigned p = bucket->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned t = bucket->perm[p + i];
+ bucket->perm[p + i] = bucket->perm[p];
+ bucket->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ bucket->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+
+ s = bucket->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)
+{
+ int i;
+
+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i])
+ return bucket->h.items[i];
+ }
+
+ BUG_ON(1);
+ return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n, l;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ int i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ default:
+ BUG_ON(1);
+ return in->items[0];
+ }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+ if (weight[item] >= 0x1000)
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;
+ return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+ struct crush_bucket *bucket,
+ __u32 *weight,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int firstn, int recurse_to_leaf,
+ int *out2)
+{
+ int rep;
+ int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ const int orig_tries = 5; /* attempts before we fall back to search */
+ dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+ for (rep = outpos; rep < numrep; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep;
+ if (in->alg == CRUSH_BUCKET_UNIFORM) {
+ /* be careful */
+ if (firstn || numrep >= in->size)
+ /* r' = r + f_total */
+ r += ftotal;
+ else if (in->size % numrep == 0)
+ /* r'=r+(n+1)*f_local */
+ r += (numrep+1) *
+ (flocal+ftotal);
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ } else {
+ if (firstn)
+ /* r' = r + f_total */
+ r += ftotal;
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ }
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (flocal >= (in->size>>1) &&
+ flocal > orig_tries)
+ item = bucket_perm_choose(in, x, r);
+ else
+ item = crush_bucket_choose(in, x, r);
+ BUG_ON(item >= map->max_devices);
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ BUG_ON(item >= 0 ||
+ (-1-item) >= map->max_buckets);
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ if (recurse_to_leaf &&
+ item < 0 &&
+ crush_choose(map, map->buckets[-1-item],
+ weight,
+ x, outpos+1, 0,
+ out2, outpos,
+ firstn, 0, NULL) <= outpos) {
+ reject = 1;
+ } else {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ item, x);
+ else
+ reject = 0;
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal < 3)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (flocal < in->size + orig_tries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < 20)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %d flocal %d\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("choose got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ }
+
+ dprintk("choose returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ int force, __u32 *weight)
+{
+ int result_len;
+ int force_context[CRUSH_MAX_DEPTH];
+ int force_pos = -1;
+ int a[CRUSH_MAX_SET];
+ int b[CRUSH_MAX_SET];
+ int c[CRUSH_MAX_SET];
+ int recurse_to_leaf;
+ int *w;
+ int wsize = 0;
+ int *o;
+ int osize;
+ int *tmp;
+ struct crush_rule *rule;
+ int step;
+ int i, j;
+ int numrep;
+ int firstn;
+ int rc = -1;
+
+ BUG_ON(ruleno >= map->max_rules);
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+ w = a;
+ o = b;
+
+ /*
+ * determine hierarchical context of force, if any. note
+ * that this may or may not correspond to the specific types
+ * referenced by the crush rule.
+ */
+ if (force >= 0) {
+ if (force >= map->max_devices ||
+ map->device_parents[force] == 0) {
+ /*dprintk("CRUSH: forcefed device dne\n");*/
+ rc = -1; /* force fed device dne */
+ goto out;
+ }
+ if (!is_out(map, weight, force, x)) {
+ while (1) {
+ force_context[++force_pos] = force;
+ if (force >= 0)
+ force = map->device_parents[force];
+ else
+ force = map->bucket_parents[-1-force];
+ if (force == 0)
+ break;
+ }
+ }
+ }
+
+ for (step = 0; step < rule->len; step++) {
+ firstn = 0;
+ switch (rule->steps[step].op) {
+ case CRUSH_RULE_TAKE:
+ w[0] = rule->steps[step].arg1;
+ if (force_pos >= 0) {
+ BUG_ON(force_context[force_pos] != w[0]);
+ force_pos--;
+ }
+ wsize = 1;
+ break;
+
+ case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ BUG_ON(wsize == 0);
+
+ recurse_to_leaf =
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ /*
+ * see CRUSH_N, CRUSH_N_MINUS macros.
+ * basically, numrep <= 0 means relative to
+ * the provided result_max
+ */
+ numrep = rule->steps[step].arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ if (osize == 0 && force_pos >= 0) {
+ /* skip any intermediate types */
+ while (force_pos &&
+ force_context[force_pos] < 0 &&
+ rule->steps[step].arg2 !=
+ map->buckets[-1 -
+ force_context[force_pos]]->type)
+ force_pos--;
+ o[osize] = force_context[force_pos];
+ if (recurse_to_leaf)
+ c[osize] = force_context[0];
+ j++;
+ force_pos--;
+ }
+ osize += crush_choose(map,
+ map->buckets[-1-w[i]],
+ weight,
+ x, numrep,
+ rule->steps[step].arg2,
+ o+osize, j,
+ firstn,
+ recurse_to_leaf, c+osize);
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap t and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ BUG_ON(1);
+ }
+ }
+ rc = result_len;
+
+out:
+ return rc;
+}
+
+
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef _CRUSH_MAPPER_H
+#define _CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+ int ruleno,
+ int x, int *result, int result_max,
+ int forcefeed, /* -1 for none */
+ __u32 *weights);
+
+#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..b90fc3e1ff70
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,450 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "mon_client.h"
+#include "auth.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../mdsmap - current mdsmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../mdsc - active mds requests
+ * .../monc - mon client state
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->monc.monmap == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ pr_addr(&inst->addr.in_addr));
+ }
+ return 0;
+}
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->mdsc.mdsmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "session_timeout %d\n",
+ client->mdsc.mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n",
+ client->mdsc.mdsmap->m_session_autoclose);
+ for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ struct ceph_entity_addr *addr =
+ &client->mdsc.mdsmap->m_info[i].addr;
+ int state = client->mdsc.mdsmap->m_info[i].state;
+
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ ceph_mds_state_name(state));
+ }
+ return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->osdc.osdmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+ seq_printf(s, "flags%s%s\n",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+ " NEARFULL" : "",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+ " FULL" : "");
+ for (i = 0; i < client->osdc.osdmap->num_pools; i++) {
+ struct ceph_pg_pool_info *pool =
+ &client->osdc.osdmap->pg_pool[i];
+ seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+ i, pool->v.pg_num, pool->pg_num_mask,
+ pool->v.lpg_num, pool->lpg_num_mask);
+ }
+ for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+ struct ceph_entity_addr *addr =
+ &client->osdc.osdmap->osd_addr[i];
+ int state = client->osdc.osdmap->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ i, pr_addr(&addr->in_addr),
+ ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state));
+ }
+ return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_statfs_request *req;
+ u64 nexttid = 0;
+ int got;
+ struct ceph_mon_client *monc = &client->monc;
+
+ mutex_lock(&monc->mutex);
+
+ if (monc->have_mdsmap)
+ seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+ if (monc->have_osdmap)
+ seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+ if (monc->want_next_osdmap)
+ seq_printf(s, "want next osdmap\n");
+
+ while (nexttid < monc->last_tid) {
+ got = radix_tree_gang_lookup(&monc->statfs_request_tree,
+ (void **)&req, nexttid, 1);
+ if (got == 0)
+ break;
+ nexttid = req->tid + 1;
+
+ seq_printf(s, "%lld statfs\n", req->tid);
+ }
+ mutex_unlock(&monc->mutex);
+
+ return 0;
+}
+
+static int mdsc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mds_request *req;
+ u64 nexttid = 0;
+ int got;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ int pathlen;
+ u64 pathbase;
+ char *path;
+
+ mutex_lock(&mdsc->mutex);
+ while (nexttid < mdsc->last_tid) {
+ got = radix_tree_gang_lookup(&mdsc->request_tree,
+ (void **)&req, nexttid, 1);
+ if (got == 0)
+ break;
+ nexttid = req->r_tid + 1;
+
+ if (req->r_request)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+ else
+ seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+
+ seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+ if (req->r_got_unsafe)
+ seq_printf(s, "\t(unsafe)");
+ else
+ seq_printf(s, "\t");
+
+ if (req->r_inode) {
+ seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+ } else if (req->r_dentry) {
+ path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_dentry->d_parent->d_inode),
+ req->r_dentry->d_name.len,
+ req->r_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path1) {
+ seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+ req->r_path1);
+ }
+
+ if (req->r_old_dentry) {
+ path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_old_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_old_dentry->d_parent->d_inode),
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_old_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path2) {
+ if (req->r_ino2.ino)
+ seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+ req->r_path2);
+ else
+ seq_printf(s, " %s", req->r_path2);
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *p;
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ int num_ops;
+ int opcode, olen;
+ int i;
+
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ seq_printf(s, "%lld\tosd%d\t", req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ head = req->r_request->front.iov_base;
+ op = (void *)(head + 1);
+
+ num_ops = le16_to_cpu(head->num_ops);
+ olen = le32_to_cpu(head->object_len);
+ seq_printf(s, "%.*s", olen,
+ (const char *)(head->ops + num_ops));
+
+ if (req->r_reassert_version.epoch)
+ seq_printf(s, "\t%u'%llu",
+ (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+ le64_to_cpu(req->r_reassert_version.version));
+ else
+ seq_printf(s, "\t");
+
+ for (i = 0; i < num_ops; i++) {
+ opcode = le16_to_cpu(op->op);
+ seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+ op++;
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&osdc->request_mutex);
+ return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = p;
+ int total, avail, used, reserved;
+
+ ceph_reservation_status(client, &total, &avail, &used, &reserved);
+ seq_printf(s, "total\t\t%d\n"
+ "avail\t\t%d\n"
+ "used\t\t%d\n"
+ "reserved\t%d\n",
+ total, avail, used, reserved);
+ return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_dentry_info *di;
+
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+ struct dentry *dentry = di->dentry;
+ seq_printf(s, "%p %p\t%.*s\n",
+ di, dentry, dentry->d_name.len, dentry->d_name.name);
+ }
+ spin_unlock(&mdsc->dentry_lru_lock);
+
+ return 0;
+}
+
+#define DEFINE_SHOW_FUNC(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ struct seq_file *sf; \
+ int ret; \
+ \
+ ret = single_open(file, name, NULL); \
+ sf = file->private_data; \
+ sf->private = inode->i_private; \
+ return ret; \
+} \
+ \
+static const struct file_operations name##_fops = { \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+};
+
+DEFINE_SHOW_FUNC(monmap_show)
+DEFINE_SHOW_FUNC(mdsmap_show)
+DEFINE_SHOW_FUNC(osdmap_show)
+DEFINE_SHOW_FUNC(monc_show)
+DEFINE_SHOW_FUNC(mdsc_show)
+DEFINE_SHOW_FUNC(osdc_show)
+DEFINE_SHOW_FUNC(dentry_lru_show)
+DEFINE_SHOW_FUNC(caps_show)
+
+int __init ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+ if (!ceph_debugfs_dir)
+ return -ENOMEM;
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ int ret = 0;
+ char name[80];
+
+ snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+ PR_FSID(&client->fsid), client->monc.auth->global_id);
+
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+ if (!client->debugfs_dir)
+ goto out;
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monc_show_fops);
+ if (!client->monc.debugfs_file)
+ goto out;
+
+ client->mdsc.debugfs_file = debugfs_create_file("mdsc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsc_show_fops);
+ if (!client->mdsc.debugfs_file)
+ goto out;
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdc_show_fops);
+ if (!client->osdc.debugfs_file)
+ goto out;
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monmap_show_fops);
+ if (!client->debugfs_monmap)
+ goto out;
+
+ client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsmap_show_fops);
+ if (!client->debugfs_mdsmap)
+ goto out;
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdmap_show_fops);
+ if (!client->debugfs_osdmap)
+ goto out;
+
+ client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &dentry_lru_show_fops);
+ if (!client->debugfs_dentry_lru)
+ goto out;
+
+ client->debugfs_caps = debugfs_create_file("caps",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &caps_show_fops);
+ if (!client->debugfs_caps)
+ goto out;
+
+ return 0;
+
+out:
+ ceph_debugfs_client_cleanup(client);
+ return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ debugfs_remove(client->debugfs_caps);
+ debugfs_remove(client->debugfs_dentry_lru);
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_mdsmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->mdsc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else // CONFIG_DEBUG_FS
+
+int __init ceph_debugfs_init(void)
+{
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif // CONFIG_DEBUG_FS
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..10de84896244
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,159 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ * void **p pointer to position pointer
+ * void *end pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+ u64 v = get_unaligned_le64(*p);
+ *p += sizeof(u64);
+ return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+ u32 v = get_unaligned_le32(*p);
+ *p += sizeof(u32);
+ return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+ u16 v = get_unaligned_le16(*p);
+ *p += sizeof(u16);
+ return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+ u8 v = *(u8 *)*p;
+ (*p)++;
+ return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+ memcpy(pv, *p, n);
+ *p += n;
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (unlikely(*(p) + (n) > (end))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u64), bad); \
+ v = ceph_decode_64(p); \
+ } while (0)
+#define ceph_decode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u32), bad); \
+ v = ceph_decode_32(p); \
+ } while (0)
+#define ceph_decode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u16), bad); \
+ v = ceph_decode_16(p); \
+ } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ ceph_decode_copy(p, pv, n); \
+ } while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+ const struct ceph_timespec *tv)
+{
+ ts->tv_sec = le32_to_cpu(tv->tv_sec);
+ ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+ const struct timespec *ts)
+{
+ tv->tv_sec = cpu_to_le32(ts->tv_sec);
+ tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = htons(a->in_addr.ss_family);
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+ WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+ put_unaligned_le64(v, (__le64 *)*p);
+ *p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+ put_unaligned_le32(v, (__le32 *)*p);
+ *p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+ put_unaligned_le16(v, (__le16 *)*p);
+ *p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+ *(u8 *)*p = v;
+ (*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+ u64 ino, const char *path)
+{
+ u32 len = path ? strlen(path) : 0;
+ BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+ ceph_encode_64(p, ino);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, path, len);
+ *p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+ const char *s, u32 len)
+{
+ BUG_ON(*p + sizeof(len) + len > end);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+
+#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..32ef54367224
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1214 @@
+#include "ceph_debug.h"
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include "super.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path. Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino). The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct inode_operations ceph_dir_iops;
+const struct file_operations ceph_dir_fops;
+struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ if (dentry->d_fsdata)
+ return 0;
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ dentry->d_op = &ceph_dentry_ops;
+ else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+ dentry->d_op = &ceph_snapdir_dentry_ops;
+ else
+ dentry->d_op = &ceph_snap_dentry_ops;
+
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+ if (!di)
+ return -ENOMEM; /* oh well */
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_fsdata) /* lost a race */
+ goto out_unlock;
+ di->dentry = dentry;
+ di->lease_session = NULL;
+ dentry->d_fsdata = di;
+ dentry->d_time = jiffies;
+ ceph_dentry_lru_add(dentry);
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ return 0;
+}
+
+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+ return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+ return p & 0xffffffff;
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache. We make this work by carefully ordering dentries on
+ * d_u.d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * I_COMPLETE tells indicates we have all dentries in the dir. It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_file_info *fi = filp->private_data;
+ struct dentry *parent = filp->f_dentry;
+ struct inode *dir = parent->d_inode;
+ struct list_head *p;
+ struct dentry *dentry, *last;
+ struct ceph_dentry_info *di;
+ int err = 0;
+
+ /* claim ref on last dentry we returned */
+ last = fi->dentry;
+ fi->dentry = NULL;
+
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ last);
+
+ spin_lock(&dcache_lock);
+
+ /* start at beginning? */
+ if (filp->f_pos == 2 || (last &&
+ filp->f_pos < ceph_dentry(last)->offset)) {
+ if (list_empty(&parent->d_subdirs))
+ goto out_unlock;
+ p = parent->d_subdirs.prev;
+ dout(" initial p %p/%p\n", p->prev, p->next);
+ } else {
+ p = last->d_u.d_child.prev;
+ }
+
+more:
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ while (1) {
+ dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+ parent->d_subdirs.prev, parent->d_subdirs.next);
+ if (p == &parent->d_subdirs) {
+ fi->at_end = 1;
+ goto out_unlock;
+ }
+ if (!d_unhashed(dentry) && dentry->d_inode &&
+ ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
+ filp->f_pos <= di->offset)
+ break;
+ dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, di->offset,
+ filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ !dentry->d_inode ? " null" : "");
+ p = p->prev;
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ }
+
+ atomic_inc(&dentry->d_count);
+ spin_unlock(&dcache_lock);
+ spin_unlock(&inode->i_lock);
+
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ filp->f_pos = di->offset;
+ err = filldir(dirent, dentry->d_name.name,
+ dentry->d_name.len, di->offset,
+ dentry->d_inode->i_ino,
+ dentry->d_inode->i_mode >> 12);
+
+ if (last) {
+ if (err < 0) {
+ /* remember our position */
+ fi->dentry = last;
+ fi->next_offset = di->offset;
+ } else {
+ dput(last);
+ }
+ last = NULL;
+ }
+
+ spin_lock(&inode->i_lock);
+ spin_lock(&dcache_lock);
+
+ if (err < 0)
+ goto out_unlock;
+
+ last = dentry;
+
+ p = p->prev;
+ filp->f_pos++;
+
+ /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+ if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+ goto more;
+ dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ err = -EAGAIN;
+
+out_unlock:
+ spin_unlock(&dcache_lock);
+
+ if (last) {
+ spin_unlock(&inode->i_lock);
+ dput(last);
+ spin_lock(&inode->i_lock);
+ }
+
+ return err;
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+ int len)
+{
+ kfree(fi->last_name);
+ fi->last_name = kmalloc(len+1, GFP_NOFS);
+ if (!fi->last_name)
+ return -ENOMEM;
+ memcpy(fi->last_name, name, len);
+ fi->last_name[len] = 0;
+ dout("note_last_dentry '%s'\n", fi->last_name);
+ return 0;
+}
+
+static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ unsigned frag = fpos_frag(filp->f_pos);
+ int off = fpos_off(filp->f_pos);
+ int err;
+ u32 ftype;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ const int max_entries = client->mount_args->max_readdir;
+
+ dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ if (fi->at_end)
+ return 0;
+
+ /* always start with . and .. */
+ if (filp->f_pos == 0) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ fi->dir_release_count = ci->i_release_count;
+
+ dout("readdir off 0 -> '.'\n");
+ if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ inode->i_ino, inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 1;
+ off = 1;
+ }
+ if (filp->f_pos == 1) {
+ dout("readdir off 1 -> '..'\n");
+ if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ filp->f_dentry->d_parent->d_inode->i_ino,
+ inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 2;
+ off = 2;
+ }
+
+ /* can we use the dcache? */
+ spin_lock(&inode->i_lock);
+ if ((filp->f_pos == 2 || fi->dentry) &&
+ !ceph_test_opt(client, NOASYNCREADDIR) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ err = __dcache_readdir(filp, dirent, filldir);
+ if (err != -EAGAIN) {
+ spin_unlock(&inode->i_lock);
+ return err;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ if (fi->dentry) {
+ err = note_last_dentry(fi, fi->dentry->d_name.name,
+ fi->dentry->d_name.len);
+ if (err)
+ return err;
+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+
+ /* proceed with a normal readdir */
+
+more:
+ /* do we have the correct frag content buffered? */
+ if (fi->frag != frag || fi->last_readdir == NULL) {
+ struct ceph_mds_request *req;
+ int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+ /* discard old result, if any */
+ if (fi->last_readdir)
+ ceph_mdsc_put_request(fi->last_readdir);
+
+ /* requery frag tree, as the frag topology may have changed */
+ frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+
+ dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+ ceph_vinop(inode), frag, fi->last_name);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_dentry = dget(filp->f_dentry);
+ /* hints to request -> mds selection code */
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_direct_hash = ceph_frag_value(frag);
+ req->r_direct_is_hash = true;
+ req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+ req->r_readdir_offset = fi->next_offset;
+ req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+ req->r_num_caps = max_entries;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ dout("readdir got and parsed readdir result=%d"
+ " on frag %x, end=%d, complete=%d\n", err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete);
+
+ if (!req->r_did_prepopulate) {
+ dout("readdir !did_prepopulate");
+ fi->dir_release_count--; /* preclude I_COMPLETE */
+ }
+
+ /* note next offset and last dentry name */
+ fi->offset = fi->next_offset;
+ fi->last_readdir = req;
+
+ if (req->r_reply_info.dir_end) {
+ kfree(fi->last_name);
+ fi->last_name = NULL;
+ fi->next_offset = 0;
+ } else {
+ rinfo = &req->r_reply_info;
+ err = note_last_dentry(fi,
+ rinfo->dir_dname[rinfo->dir_nr-1],
+ rinfo->dir_dname_len[rinfo->dir_nr-1]);
+ if (err)
+ return err;
+ fi->next_offset += rinfo->dir_nr;
+ }
+ }
+
+ rinfo = &fi->last_readdir->r_reply_info;
+ dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+ rinfo->dir_nr, off, fi->offset);
+ while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+ u64 pos = ceph_make_fpos(frag, off);
+ struct ceph_mds_reply_inode *in =
+ rinfo->dir_in[off - fi->offset].in;
+ dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+ off, off - fi->offset, rinfo->dir_nr, pos,
+ rinfo->dir_dname_len[off - fi->offset],
+ rinfo->dir_dname[off - fi->offset], in);
+ BUG_ON(!in);
+ ftype = le32_to_cpu(in->mode) >> 12;
+ if (filldir(dirent,
+ rinfo->dir_dname[off - fi->offset],
+ rinfo->dir_dname_len[off - fi->offset],
+ pos,
+ le64_to_cpu(in->ino),
+ ftype) < 0) {
+ dout("filldir stopping us...\n");
+ return 0;
+ }
+ off++;
+ filp->f_pos = pos + 1;
+ }
+
+ if (fi->last_name) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ goto more;
+ }
+
+ /* more frags? */
+ if (!ceph_frag_is_rightmost(frag)) {
+ frag = ceph_frag_next(frag);
+ off = 0;
+ filp->f_pos = ceph_make_fpos(frag, off);
+ dout("readdir next frag is %x\n", frag);
+ goto more;
+ }
+ fi->at_end = 1;
+
+ /*
+ * if dir_release_count still matches the dir, no dentries
+ * were released during the whole readdir, and we should have
+ * the complete dir contents in our cache.
+ */
+ spin_lock(&inode->i_lock);
+ if (ci->i_release_count == fi->dir_release_count) {
+ dout(" marking %p complete\n", inode);
+ ci->i_ceph_flags |= CEPH_I_COMPLETE;
+ ci->i_max_offset = filp->f_pos;
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("readdir %p filp %p done.\n", inode, filp);
+ return 0;
+}
+
+static void reset_readdir(struct ceph_file_info *fi)
+{
+ if (fi->last_readdir) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ }
+ kfree(fi->last_name);
+ fi->next_offset = 2; /* compensate for . and .. */
+ if (fi->dentry) {
+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+ fi->at_end = 0;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file->f_mapping->host;
+ loff_t old_offset = offset;
+ loff_t retval;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_END:
+ offset += inode->i_size + 2; /* FIXME */
+ break;
+ case SEEK_CUR:
+ offset += file->f_pos;
+ }
+ retval = -EINVAL;
+ if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ fi->at_end = 0;
+ }
+ retval = offset;
+
+ /*
+ * discard buffered readdir content on seekdir(0), or
+ * seek to new frag, or seek prior to current chunk.
+ */
+ if (offset == 0 ||
+ fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_off(offset) < fi->offset) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(fi);
+ }
+
+ /* bump dir_release_count if we did a forward seek */
+ if (offset > old_offset)
+ fi->dir_release_count--;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return retval;
+}
+
+/*
+ * Process result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct inode *parent = dentry->d_parent->d_inode;
+
+ /* .snap dir? */
+ if (err == -ENOENT &&
+ ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
+ strcmp(dentry->d_name.name,
+ client->mount_args->snapdir_name) == 0) {
+ struct inode *inode = ceph_get_snapdir(parent);
+ dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
+ dentry, dentry->d_name.len, dentry->d_name.name, inode);
+ d_add(dentry, inode);
+ err = 0;
+ }
+
+ if (err == -ENOENT) {
+ /* no trace? */
+ err = 0;
+ if (!req->r_reply_info.head->is_dentry) {
+ dout("ENOENT and no trace, dentry %p inode %p\n",
+ dentry, dentry->d_inode);
+ if (dentry->d_inode) {
+ d_drop(dentry);
+ err = -ENOENT;
+ } else {
+ d_add(dentry, NULL);
+ }
+ }
+ }
+ if (err)
+ dentry = ERR_PTR(err);
+ else if (dentry != req->r_dentry)
+ dentry = dget(req->r_dentry); /* we got spliced */
+ else
+ dentry = NULL;
+ return dentry;
+}
+
+/*
+ * Look up a single dir entry. If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int op;
+ int err;
+
+ dout("lookup %p dentry %p '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ err = ceph_init_dentry(dentry);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* open (but not create!) intent? */
+ if (nd &&
+ (nd->flags & LOOKUP_OPEN) &&
+ (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
+ !(nd->intent.open.flags & O_CREAT)) {
+ int mode = nd->intent.open.create_mode & ~current->fs->umask;
+ return ceph_lookup_open(dir, dentry, nd, mode, 1);
+ }
+
+ /* can we conclude ENOENT locally? */
+ if (dentry->d_inode == NULL) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ spin_lock(&dir->i_lock);
+ dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ if (strncmp(dentry->d_name.name,
+ client->mount_args->snapdir_name,
+ dentry->d_name.len) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ di->offset = ci->i_max_offset++;
+ spin_unlock(&dir->i_lock);
+ dout(" dir %p complete, -ENOENT\n", dir);
+ d_add(dentry, NULL);
+ di->lease_shared_gen = ci->i_shared_gen;
+ return NULL;
+ }
+ spin_unlock(&dir->i_lock);
+ }
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ /* we only need inode linkage */
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_locked_dir = dir;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ ceph_mdsc_put_request(req); /* will dput(dentry) */
+ dout("lookup result=%p\n", dentry);
+ return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *result = ceph_lookup(dir, dentry, NULL);
+
+ if (result && !IS_ERR(result)) {
+ /*
+ * We created the item, then did a lookup, and found
+ * it was already linked to another inode we already
+ * had in our cache (and thus got spliced). Link our
+ * dentry to that inode, but don't hash it, just in
+ * case the VFS wants to dereference it.
+ */
+ BUG_ON(!result->d_inode);
+ d_instantiate(dentry, result->d_inode);
+ return 0;
+ }
+ return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+ dir, dentry, mode, rdev);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mknod.mode = cpu_to_le32(mode);
+ req->r_args.mknod.rdev = cpu_to_le32(rdev);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ dout("create in dir %p dentry %p name '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (nd) {
+ BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
+ dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
+ /* hrm, what should i do here if we get aliased? */
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ return 0;
+ }
+
+ /* fall back to mknod */
+ return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+ const char *dest)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_path2 = kstrdup(dest, GFP_NOFS);
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* mkdir .snap/foo is a MKSNAP */
+ op = CEPH_MDS_OP_MKSNAP;
+ dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
+ dentry->d_name.len, dentry->d_name.name, dentry);
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+ op = CEPH_MDS_OP_MKDIR;
+ } else {
+ goto out;
+ }
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mkdir.mode = cpu_to_le32(mode);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (err < 0)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("link in dir %p old_dentry %p dentry %p\n", dir,
+ old_dentry, dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (err)
+ d_drop(dentry);
+ else if (!req->r_reply_info.head->is_dentry)
+ d_instantiate(dentry, igrab(old_dentry->d_inode));
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ }
+ spin_unlock(&inode->i_lock);
+ return drop;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = dentry->d_inode;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* rmdir .snap/foo is RMSNAP */
+ dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
+ dentry->d_name.name, dentry);
+ op = CEPH_MDS_OP_RMSNAP;
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("unlink/rmdir dir %p dn %p inode %p\n",
+ dir, dentry, inode);
+ op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+ } else
+ goto out;
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_inode_drop = drop_caps_for_unlink(inode);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ ceph_mdsc_put_request(req);
+out:
+ return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(old_dir) != ceph_snap(new_dir))
+ return -EXDEV;
+ if (ceph_snap(old_dir) != CEPH_NOSNAP ||
+ ceph_snap(new_dir) != CEPH_NOSNAP)
+ return -EROFS;
+ dout("rename dir %p dentry %p to dir %p dentry %p\n",
+ old_dir, old_dentry, new_dir, new_dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_dentry = dget(new_dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry);
+ req->r_locked_dir = new_dir;
+ req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_RDCACHE on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+ if (new_dentry->d_inode)
+ req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+ err = ceph_mdsc_do_request(mdsc, old_dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry) {
+ /*
+ * Normally d_move() is done by fill_trace (called by
+ * do_request, above). If there is no trace, we need
+ * to do it here.
+ */
+ d_move(old_dentry, new_dentry);
+ }
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+
+/*
+ * Check if dentry lease is valid. If not, delete the lease. Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *s;
+ int valid = 0;
+ u32 gen;
+ unsigned long ttl;
+ struct ceph_mds_session *session = NULL;
+ struct inode *dir = NULL;
+ u32 seq = 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (di && di->lease_session) {
+ s = di->lease_session;
+ spin_lock(&s->s_cap_lock);
+ gen = s->s_cap_gen;
+ ttl = s->s_cap_ttl;
+ spin_unlock(&s->s_cap_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, ttl)) {
+ valid = 1;
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /* we should renew */
+ dir = dentry->d_parent->d_inode;
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
+ } else {
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (session) {
+ ceph_mdsc_lease_send_msg(session, dir, dentry,
+ CEPH_MDS_LEASE_RENEW, seq);
+ ceph_put_mds_session(session);
+ }
+ dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int valid = 0;
+
+ spin_lock(&dir->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen)
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ spin_unlock(&dir->i_lock);
+ dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+ dir, (unsigned)ci->i_shared_gen, dentry,
+ (unsigned)di->lease_shared_gen, valid);
+ return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ dout("d_revalidate %p '%.*s' inode %p\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+
+ /* always trust cached snapped dentries, snapdir dentry */
+ if (ceph_snap(dir) != CEPH_NOSNAP) {
+ dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ goto out_touch;
+ }
+ if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
+ goto out_touch;
+
+ if (dentry_lease_is_valid(dentry) ||
+ dir_lease_is_valid(dir, dentry))
+ goto out_touch;
+
+ dout("d_revalidate %p invalid\n", dentry);
+ d_drop(dentry);
+ return 0;
+out_touch:
+ ceph_dentry_lru_touch(dentry);
+ return 1;
+}
+
+/*
+ * When a dentry is released, clear the dir I_COMPLETE if it was part
+ * of the current dir gen.
+ */
+static void ceph_dentry_release(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+
+ if (parent_inode) {
+ struct ceph_inode_info *ci = ceph_inode(parent_inode);
+
+ spin_lock(&parent_inode->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen) {
+ dout(" clearing %p complete (d_release)\n",
+ parent_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ }
+ spin_unlock(&parent_inode->i_lock);
+ }
+ if (di) {
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
+ }
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+ struct nameidata *nd)
+{
+ /*
+ * Eventually, we'll want to revalidate snapped metadata
+ * too... probably...
+ */
+ return 1;
+}
+
+
+
+/*
+ * read() on a dir. This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct ceph_file_info *cf = file->private_data;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int left;
+
+ if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+ return -EISDIR;
+
+ if (!cf->dir_info) {
+ cf->dir_info = kmalloc(1024, GFP_NOFS);
+ if (!cf->dir_info)
+ return -ENOMEM;
+ cf->dir_info_len =
+ sprintf(cf->dir_info,
+ "entries: %20lld\n"
+ " files: %20lld\n"
+ " subdirs: %20lld\n"
+ "rentries: %20lld\n"
+ " rfiles: %20lld\n"
+ " rsubdirs: %20lld\n"
+ "rbytes: %20lld\n"
+ "rctime: %10ld.%09ld\n",
+ ci->i_files + ci->i_subdirs,
+ ci->i_files,
+ ci->i_subdirs,
+ ci->i_rfiles + ci->i_rsubdirs,
+ ci->i_rfiles,
+ ci->i_rsubdirs,
+ ci->i_rbytes,
+ (long)ci->i_rctime.tv_sec,
+ (long)ci->i_rctime.tv_nsec);
+ }
+
+ if (*ppos >= cf->dir_info_len)
+ return 0;
+ size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+ left = copy_to_user(buf, cf->dir_info + *ppos, size);
+ if (left == size)
+ return -EFAULT;
+ *ppos += (size - left);
+ return size - left;
+}
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
+ int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_dirops;
+ struct ceph_mds_request *req;
+ u64 last_tid;
+ int ret = 0;
+
+ dout("dir_fsync %p\n", inode);
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ req = list_entry(head->prev,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_mdsc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+ inode, req->r_tid, last_tid);
+ if (req->r_timeout) {
+ ret = wait_for_completion_timeout(
+ &req->r_safe_completion, req->r_timeout);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ ret = -EIO; /* timed out */
+ } else {
+ wait_for_completion(&req->r_safe_completion);
+ }
+ spin_lock(&ci->i_unsafe_lock);
+ ceph_mdsc_put_request(req);
+
+ if (ret || list_empty(head))
+ break;
+ req = list_entry(head->next,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+ return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+ dout("dentry_lru_add %p %p\t%.*s\n",
+ di, dn, dn->d_name.len, dn->d_name.name);
+
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+ dout("dentry_lru_touch %p %p\t%.*s\n",
+ di, dn, dn->d_name.len, dn->d_name.name);
+
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_del %p %p\t%.*s\n",
+ di, dn, dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+const struct file_operations ceph_dir_fops = {
+ .read = ceph_read_dir,
+ .readdir = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+ .unlocked_ioctl = ceph_ioctl,
+ .fsync = ceph_dir_fsync,
+};
+
+const struct inode_operations ceph_dir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .setattr = ceph_setattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
+ .mknod = ceph_mknod,
+ .symlink = ceph_symlink,
+ .mkdir = ceph_mkdir,
+ .link = ceph_link,
+ .unlink = ceph_unlink,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
+ .create = ceph_create,
+};
+
+struct dentry_operations ceph_dentry_ops = {
+ .d_revalidate = ceph_d_revalidate,
+ .d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snapdir_dentry_ops = {
+ .d_revalidate = ceph_snapdir_d_revalidate,
+};
+
+struct dentry_operations ceph_snap_dentry_ops = {
+};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
+#include "ceph_debug.h"
+
+#include <linux/exportfs.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+
+/*
+ * NFS export support
+ *
+ * NFS re-export of a ceph mount is, at present, only semireliable.
+ * The basic issue is that the Ceph architectures doesn't lend itself
+ * well to generating filehandles that will remain valid forever.
+ *
+ * So, we do our best. If you're lucky, your inode will be in the
+ * client's cache. If it's not, and you have a connectable fh, then
+ * the MDS server may be able to find it for you. Otherwise, you get
+ * ESTALE.
+ *
+ * There are ways to this more reliable, but in the non-connectable fh
+ * case, we won't every work perfectly, and in the connectable case,
+ * some changes are needed on the MDS side to work better.
+ */
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+ u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger 'connectable' fh that includes parent ino and name hash.
+ * Use this whenever possible, as it works more reliably.
+ */
+struct ceph_nfs_confh {
+ u64 ino, parent_ino;
+ u32 parent_name_hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
+ int connectable)
+{
+ struct ceph_nfs_fh *fh = (void *)rawfh;
+ struct ceph_nfs_confh *cfh = (void *)rawfh;
+ struct dentry *parent = dentry->d_parent;
+ struct inode *inode = dentry->d_inode;
+ int type;
+
+ /* don't re-export snaps */
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EINVAL;
+
+ if (*max_len >= sizeof(*cfh)) {
+ dout("encode_fh %p connectable\n", dentry);
+ cfh->ino = ceph_ino(dentry->d_inode);
+ cfh->parent_ino = ceph_ino(parent->d_inode);
+ cfh->parent_name_hash = parent->d_name.hash;
+ *max_len = sizeof(*cfh);
+ type = 2;
+ } else if (*max_len > sizeof(*fh)) {
+ if (connectable)
+ return -ENOSPC;
+ dout("encode_fh %p\n", dentry);
+ fh->ino = ceph_ino(dentry->d_inode);
+ *max_len = sizeof(*fh);
+ type = 1;
+ } else {
+ return -ENOSPC;
+ }
+ return type;
+}
+
+/*
+ * convert regular fh to dentry
+ *
+ * FIXME: we should try harder by querying the mds for the ino.
+ */
+static struct dentry *__fh_to_dentry(struct super_block *sb,
+ struct ceph_nfs_fh *fh)
+{
+ struct inode *inode;
+ struct dentry *dentry;
+ struct ceph_vino vino;
+ int err;
+
+ dout("__fh_to_dentry %llx\n", fh->ino);
+ vino.ino = fh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+
+ dentry = d_obtain_alias(inode);
+ if (!dentry) {
+ pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
+ fh->ino, inode);
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ err = ceph_init_dentry(dentry);
+
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ return dentry;
+}
+
+/*
+ * convert connectable fh to dentry
+ */
+static struct dentry *__cfh_to_dentry(struct super_block *sb,
+ struct ceph_nfs_confh *cfh)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+ struct inode *inode;
+ struct dentry *dentry;
+ struct ceph_vino vino;
+ int err;
+
+ dout("__cfh_to_dentry %llx (%llx/%x)\n",
+ cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
+
+ vino.ino = cfh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ struct ceph_mds_request *req;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+
+ req->r_ino1 = vino;
+ req->r_ino2.ino = cfh->parent_ino;
+ req->r_ino2.snap = CEPH_NOSNAP;
+ req->r_path2 = kmalloc(16, GFP_NOFS);
+ snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(err ? err : -ESTALE);
+ }
+
+ dentry = d_obtain_alias(inode);
+ if (!dentry) {
+ pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
+ cfh->ino, inode);
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ err = ceph_init_dentry(dentry);
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ return dentry;
+}
+
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ if (fh_type == 1)
+ return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
+ else
+ return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+}
+
+/*
+ * get parent, if possible.
+ *
+ * FIXME: we could do better by querying the mds to discover the
+ * parent.
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_confh *cfh = (void *)fid->raw;
+ struct ceph_vino vino;
+ struct inode *inode;
+ struct dentry *dentry;
+ int err;
+
+ if (fh_type == 1)
+ return ERR_PTR(-ESTALE);
+
+ pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
+ cfh->parent_name_hash);
+
+ vino.ino = cfh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+
+ dentry = d_obtain_alias(inode);
+ if (!dentry) {
+ pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
+ cfh->ino, inode);
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ err = ceph_init_dentry(dentry);
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ return dentry;
+}
+
+const struct export_operations ceph_export_ops = {
+ .encode_fh = ceph_encode_fh,
+ .fh_to_dentry = ceph_fh_to_dentry,
+ .fh_to_parent = ceph_fh_to_parent,
+};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..fc8aff4767d3
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,904 @@
+#include "ceph_debug.h"
+
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ * - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ * - synchronous is used when there is multi-client read/write
+ * sharing, avoids the page cache, and synchronously waits for an
+ * ack from the OSD.
+ *
+ * - direct io takes the variant of the sync path that references
+ * user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+
+/*
+ * Prepare an open request. Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+ struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int want_auth = USE_ANY_MDS;
+ int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+ if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+ want_auth = USE_AUTH_MDS;
+
+ req = ceph_mdsc_create_request(mdsc, op, want_auth);
+ if (IS_ERR(req))
+ goto out;
+ req->r_fmode = ceph_flags_to_mode(flags);
+ req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.mode = cpu_to_le32(create_mode);
+ req->r_args.open.preferred = cpu_to_le32(-1);
+out:
+ return req;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+ struct ceph_file_info *cf;
+ int ret = 0;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ dout("init_file %p %p 0%o (regular)\n", inode, file,
+ inode->i_mode);
+ cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+ if (cf == NULL) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+ cf->fmode = fmode;
+ cf->next_offset = 2;
+ file->private_data = cf;
+ BUG_ON(inode->i_fop->release != ceph_release);
+ break;
+
+ case S_IFLNK:
+ dout("init_file %p %p 0%o (symlink)\n", inode, file,
+ inode->i_mode);
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ break;
+
+ default:
+ dout("init_file %p %p 0%o (special)\n", inode, file,
+ inode->i_mode);
+ /*
+ * we need to drop the open ref now, since we don't
+ * have .release set to ceph_release.
+ */
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ BUG_ON(inode->i_fop->release == ceph_release);
+
+ /* call the proper open fop */
+ ret = inode->i_fop->open(inode, file);
+ }
+ return ret;
+}
+
+/*
+ * If the filp already has private_data, that means the file was
+ * already opened by intent during lookup, and we do nothing.
+ *
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS). We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_file_info *cf = file->private_data;
+ struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ int err;
+ int flags, fmode, wanted;
+
+ if (cf) {
+ dout("open file %p is already opened\n", file);
+ return 0;
+ }
+
+ /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
+ flags = file->f_flags & ~(O_CREAT|O_EXCL);
+ if (S_ISDIR(inode->i_mode))
+ flags = O_DIRECTORY; /* mds likes to know */
+
+ dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
+ fmode = ceph_flags_to_mode(flags);
+ wanted = ceph_caps_for_mode(fmode);
+
+ /* snapped files are read-only */
+ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+ return -EROFS;
+
+ /* trivially open snapdir */
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ spin_lock(&inode->i_lock);
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+
+ /*
+ * No need to block if we have any caps. Update wanted set
+ * asynchronously.
+ */
+ spin_lock(&inode->i_lock);
+ if (__ceph_is_any_real_caps(ci)) {
+ int mds_wanted = __ceph_caps_mds_wanted(ci);
+ int issued = __ceph_caps_issued(ci, NULL);
+
+ dout("open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+
+ /* adjust wanted? */
+ if ((issued & wanted) != wanted &&
+ (mds_wanted & wanted) != wanted &&
+ ceph_snap(inode) != CEPH_SNAPDIR)
+ ceph_check_caps(ci, 0, NULL);
+
+ return ceph_init_file(inode, file, fmode);
+ } else if (ceph_snap(inode) != CEPH_NOSNAP &&
+ (ci->i_snap_caps & wanted) == wanted) {
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_inode = igrab(inode);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ if (!err)
+ err = ceph_init_file(inode, file, req->r_fmode);
+ ceph_mdsc_put_request(req);
+ dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+ return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request.
+ *
+ * If this succeeds, but some subsequent check in the vfs
+ * may_open() fails, the struct *file gets cleaned up (i.e.
+ * ceph_release gets called). So fear not!
+ */
+/*
+ * flags
+ * path_lookup_open -> LOOKUP_OPEN
+ * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
+ */
+struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd, int mode,
+ int locked_dir)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct file *file = nd->intent.open.file;
+ struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+ struct ceph_mds_request *req;
+ int err;
+ int flags = nd->intent.open.flags - 1; /* silly vfs! */
+
+ dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
+ dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+
+ /* do the open */
+ req = prepare_open_request(dir->i_sb, flags, mode);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ if (flags & O_CREAT) {
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ }
+ req->r_locked_dir = dir; /* caller holds dir->i_mutex */
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ if (!err)
+ err = ceph_init_file(req->r_dentry->d_inode, file,
+ req->r_fmode);
+ ceph_mdsc_put_request(req);
+ dout("ceph_lookup_open result=%p\n", dentry);
+ return dentry;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *cf = file->private_data;
+
+ dout("release inode %p file %p\n", inode, file);
+ ceph_put_fmode(ci, cf->fmode);
+ if (cf->last_readdir)
+ ceph_mdsc_put_request(cf->last_readdir);
+ kfree(cf->last_name);
+ kfree(cf->dir_info);
+ dput(cf->dentry);
+ kmem_cache_free(ceph_file_cachep, cf);
+ return 0;
+}
+
+/*
+ * build a vector of user pages
+ */
+static struct page **get_direct_page_vector(const char __user *data,
+ int num_pages,
+ loff_t off, size_t len)
+{
+ struct page **pages;
+ int rc;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_user_pages(current, current->mm, (unsigned long)data,
+ num_pages, 0, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ goto fail;
+ return pages;
+
+fail:
+ kfree(pages);
+ return ERR_PTR(rc);
+}
+
+static void put_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+}
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ __free_pages(pages[i], 0);
+ kfree(pages);
+}
+
+/*
+ * allocate a vector new pages
+ */
+static struct page **alloc_page_vector(int num_pages)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (pages[i] == NULL) {
+ ceph_release_page_vector(pages, i);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ return pages;
+}
+
+/*
+ * copy user data into a page vector
+ */
+static int copy_user_to_page_vector(struct page **pages,
+ const char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, PAGE_CACHE_SIZE-po, left);
+ bad = copy_from_user(page_address(pages[i]) + po, data, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ if (po) {
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE)
+ po = 0;
+ }
+ }
+ return len;
+}
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+static int copy_page_vector_to_user(struct page **pages, char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, left, PAGE_CACHE_SIZE-po);
+ bad = copy_to_user(data, page_address(pages[i]) + po, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ if (po) {
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE)
+ po = 0;
+ }
+ i++;
+ }
+ return len;
+}
+
+/*
+ * Zero an extent within a page vector. Offset is relative to the
+ * start of the first page.
+ */
+static void zero_page_vector_range(int off, int len, struct page **pages)
+{
+ int i = off >> PAGE_CACHE_SHIFT;
+
+ dout("zero_page_vector_page %u~%u\n", off, len);
+ BUG_ON(len < PAGE_CACHE_SIZE);
+
+ /* leading partial page? */
+ if (off & ~PAGE_CACHE_MASK) {
+ dout("zeroing %d %p head from %d\n", i, pages[i],
+ (int)(off & ~PAGE_CACHE_MASK));
+ zero_user_segment(pages[i], off & ~PAGE_CACHE_MASK,
+ PAGE_CACHE_SIZE);
+ off += PAGE_CACHE_SIZE;
+ off &= PAGE_CACHE_MASK;
+ i++;
+ }
+ while (len >= PAGE_CACHE_SIZE) {
+ dout("zeroing %d %p\n", i, pages[i]);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ off += PAGE_CACHE_SIZE;
+ len -= PAGE_CACHE_SIZE;
+ i++;
+ }
+ /* trailing partial page? */
+ if (len) {
+ dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+ zero_user_segment(pages[i], 0, len);
+ }
+}
+
+
+/*
+ * Read a range of bytes striped over one or more objects. Iterate over
+ * objects we stripe over. (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+ u64 off, u64 len,
+ struct page **pages, int num_pages)
+{
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 pos, this_len;
+ int page_off = off & ~PAGE_CACHE_SIZE; /* first byte's offset in page */
+ int left, pages_left;
+ int read;
+ struct page **page_pos;
+ int ret;
+ bool hit_stripe, was_short;
+
+ /*
+ * we may need to do multiple reads. not atomic, unfortunately.
+ */
+ pos = off;
+ left = len;
+ page_pos = pages;
+ pages_left = num_pages;
+ read = 0;
+
+more:
+ this_len = left;
+ ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+ &ci->i_layout, pos, &this_len,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ page_pos, pages_left);
+ hit_stripe = this_len < left;
+ was_short = ret >= 0 && ret < this_len;
+ if (ret == -ENOENT)
+ ret = 0;
+ dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+ ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+ if (ret > 0) {
+ int didpages =
+ ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
+ if (read < pos - off) {
+ dout(" zero gap %llu to %llu\n", off + read, pos);
+ zero_page_vector_range(page_off + read,
+ pos - off - read, pages);
+ }
+ pos += ret;
+ read = pos - off;
+ left -= ret;
+ page_pos += didpages;
+ pages_left -= didpages;
+
+ /* hit stripe? */
+ if (left && hit_stripe)
+ goto more;
+ }
+
+ if (was_short) {
+ /* was original extent fully inside i_size? */
+ if (pos + left <= inode->i_size) {
+ dout("zero tail\n");
+ zero_page_vector_range(page_off + read, len - read,
+ pages);
+ goto out;
+ }
+
+ /* check i_size */
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ if (ret < 0)
+ goto out;
+
+ /* hit EOF? */
+ if (pos >= inode->i_size)
+ goto out;
+
+ goto more;
+ }
+
+out:
+ if (ret >= 0)
+ ret = read;
+ dout("striped_read returns %d\n", ret);
+ return ret;
+}
+
+/*
+ * Completely synchronous read and write methods. Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct file *file, char __user *data,
+ unsigned len, loff_t *poff)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct page **pages;
+ u64 off = *poff;
+ int num_pages = calc_pages_for(off, len);
+ int ret;
+
+ dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, off, len);
+
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and O_DIRECT io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
+ filemap_write_and_wait(inode->i_mapping);
+ } else {
+ pages = alloc_page_vector(num_pages);
+ }
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = striped_read(inode, off, len, pages, num_pages);
+
+ if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
+ ret = copy_page_vector_to_user(pages, data, off, ret);
+ if (ret >= 0)
+ *poff = off + ret;
+
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else
+ ceph_release_page_vector(pages, num_pages);
+ dout("sync_read result %d\n", ret);
+ return ret;
+}
+
+/*
+ * Write commit callback, called if we requested both an ACK and
+ * ONDISK commit reply from the OSD.
+ */
+static void sync_write_commit(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+
+ dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+}
+
+/*
+ * Synchronous write, straight from __user pointer or user pages (if
+ * O_DIRECT).
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct file *file, const char __user *data,
+ size_t left, loff_t *offset)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
+ long long unsigned pos;
+ u64 len;
+ int written = 0;
+ int flags;
+ int do_sync = 0;
+ int check_caps = 0;
+ int ret;
+ struct timespec mtime = CURRENT_TIME;
+
+ if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+ (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (file->f_flags & O_APPEND)
+ pos = i_size_read(inode);
+ else
+ pos = *offset;
+
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE;
+ if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
+ flags |= CEPH_OSD_FLAG_ACK;
+ else
+ do_sync = 1;
+
+ /*
+ * we may need to do multiple writes here if we span an object
+ * boundary. this isn't atomic, unfortunately. :(
+ */
+more:
+ len = left;
+ req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+ ceph_vino(inode), pos, &len,
+ CEPH_OSD_OP_WRITE, flags,
+ ci->i_snap_realm->cached_context,
+ do_sync,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &mtime, false, 2);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ num_pages = calc_pages_for(pos, len);
+
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, pos, len);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out;
+ }
+
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
+ } else {
+ pages = alloc_page_vector(num_pages);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out;
+ }
+ ret = copy_user_to_page_vector(pages, data, pos, len);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ goto out;
+ }
+
+ if ((file->f_flags & O_SYNC) == 0) {
+ /* get a second commit callback */
+ req->r_safe_callback = sync_write_commit;
+ req->r_own_pages = 1;
+ }
+ }
+ req->r_pages = pages;
+ req->r_num_pages = num_pages;
+ req->r_inode = inode;
+
+ ret = ceph_osdc_start_request(&client->osdc, req, false);
+ if (!ret) {
+ if (req->r_safe_callback) {
+ /*
+ * Add to inode unsafe list only after we
+ * start_request so that a tid has been assigned.
+ */
+ spin_lock(&ci->i_unsafe_lock);
+ list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+ }
+ ret = ceph_osdc_wait_request(&client->osdc, req);
+ }
+
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else if (file->f_flags & O_SYNC)
+ ceph_release_page_vector(pages, num_pages);
+
+out:
+ ceph_osdc_put_request(req);
+ if (ret == 0) {
+ pos += len;
+ written += len;
+ left -= len;
+ if (left)
+ goto more;
+
+ ret = written;
+ *offset = pos;
+ if (pos > i_size_read(inode))
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ return ret;
+}
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *filp = iocb->ki_filp;
+ loff_t *ppos = &iocb->ki_pos;
+ size_t len = iov->iov_len;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ ssize_t ret;
+ int got = 0;
+
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), pos, (unsigned)len, inode);
+ __ceph_do_pending_vmtruncate(inode);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
+ &got, -1);
+ if (ret < 0)
+ goto out;
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)len,
+ ceph_cap_string(got));
+
+ if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+ /* hmm, this isn't really async... */
+ ret = ceph_sync_read(filp, iov->iov_base, len, ppos);
+ else
+ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+out:
+ dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ ceph_put_cap_refs(ci, got);
+ return ret;
+}
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC. In that case, retry the write.. _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ loff_t endoff = pos + iov->iov_len;
+ int got = 0;
+ int ret;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+retry_snap:
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+ return -ENOSPC;
+ __ceph_do_pending_vmtruncate(inode);
+ dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode->i_size);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ &got, endoff);
+ if (ret < 0)
+ goto out;
+
+ dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ ceph_cap_string(got));
+
+ if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+ ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
+ &iocb->ki_pos);
+ } else {
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ if ((ret >= 0 || ret == -EIOCBQUEUED) &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
+ || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)))
+ ret = vfs_fsync_range(file, file->f_path.dentry,
+ pos, pos + ret - 1, 1);
+ }
+ if (ret >= 0) {
+ spin_lock(&inode->i_lock);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&inode->i_lock);
+ }
+
+out:
+ dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+
+ if (ret == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+ goto retry_snap;
+ }
+
+ return ret;
+}
+
+/*
+ * llseek. be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ switch (origin) {
+ case SEEK_END:
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ if (ret < 0) {
+ offset = ret;
+ goto out;
+ }
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ }
+
+ if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
+ offset = -EINVAL;
+ goto out;
+ }
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
+const struct file_operations ceph_file_fops = {
+ .open = ceph_open,
+ .release = ceph_release,
+ .llseek = ceph_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = ceph_aio_read,
+ .aio_write = ceph_aio_write,
+ .mmap = ceph_mmap,
+ .fsync = ceph_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+ .unlocked_ioctl = ceph_ioctl,
+ .compat_ioctl = ceph_ioctl,
+};
+
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..074ee42bd344
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1624 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_inode_invalidate_pages(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+ struct inode *inode;
+ ino_t t = ceph_vino_to_ino(vino);
+
+ inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (inode == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (inode->i_state & I_NEW) {
+ dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+ inode, ceph_vinop(inode), (u64)inode->i_ino);
+ unlock_new_inode(inode);
+ }
+
+ dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+ vino.snap, inode);
+ return inode;
+}
+
+/*
+ * get/constuct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+ struct ceph_vino vino = {
+ .ino = ceph_ino(parent),
+ .snap = CEPH_SNAPDIR,
+ };
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ BUG_ON(!S_ISDIR(parent->i_mode));
+ if (IS_ERR(inode))
+ return ERR_PTR(PTR_ERR(inode));
+ inode->i_mode = parent->i_mode;
+ inode->i_uid = parent->i_uid;
+ inode->i_gid = parent->i_gid;
+ inode->i_op = &ceph_dir_iops;
+ inode->i_fop = &ceph_dir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ ci->i_rbytes = 0;
+ return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+ .permission = ceph_permission,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment). We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+ u32 f)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_frag *frag;
+ int c;
+
+ p = &ci->i_fragtree.rb_node;
+ while (*p) {
+ parent = *p;
+ frag = rb_entry(parent, struct ceph_inode_frag, node);
+ c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return frag;
+ }
+
+ frag = kmalloc(sizeof(*frag), GFP_NOFS);
+ if (!frag) {
+ pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
+ "frag %x\n", &ci->vfs_inode,
+ ceph_vinop(&ci->vfs_inode), f);
+ return ERR_PTR(-ENOMEM);
+ }
+ frag->frag = f;
+ frag->split_by = 0;
+ frag->mds = -1;
+ frag->ndist = 0;
+
+ rb_link_node(&frag->node, parent, p);
+ rb_insert_color(&frag->node, &ci->i_fragtree);
+
+ dout("get_or_create_frag added %llx.%llx frag %x\n",
+ ceph_vinop(&ci->vfs_inode), f);
+ return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+ struct rb_node *n = ci->i_fragtree.rb_node;
+
+ while (n) {
+ struct ceph_inode_frag *frag =
+ rb_entry(n, struct ceph_inode_frag, node);
+ int c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return frag;
+ }
+ return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v. If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found)
+{
+ u32 t = ceph_frag_make(0, 0);
+ struct ceph_inode_frag *frag;
+ unsigned nway, i;
+ u32 n;
+
+ if (found)
+ *found = 0;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ while (1) {
+ WARN_ON(!ceph_frag_contains_value(t, v));
+ frag = __ceph_find_frag(ci, t);
+ if (!frag)
+ break; /* t is a leaf */
+ if (frag->split_by == 0) {
+ if (pfrag)
+ memcpy(pfrag, frag, sizeof(*pfrag));
+ if (found)
+ *found = 1;
+ break;
+ }
+
+ /* choose child */
+ nway = 1 << frag->split_by;
+ dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
+ for (i = 0; i < nway; i++) {
+ n = ceph_frag_make_child(t, frag->split_by, i);
+ if (ceph_frag_contains_value(n, v)) {
+ t = n;
+ break;
+ }
+ }
+ BUG_ON(i == nway);
+ }
+ dout("choose_frag(%x) = %x\n", v, t);
+
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return t;
+}
+
+/*
+ * Process dirfrag (delegation) info from the mds. Include leaf
+ * fragment in tree ONLY if ndist > 0. Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ u32 id = le32_to_cpu(dirinfo->frag);
+ int mds = le32_to_cpu(dirinfo->auth);
+ int ndist = le32_to_cpu(dirinfo->ndist);
+ int i;
+ int err = 0;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ if (ndist == 0) {
+ /* no delegation info needed. */
+ frag = __ceph_find_frag(ci, id);
+ if (!frag)
+ goto out;
+ if (frag->split_by == 0) {
+ /* tree leaf, remove */
+ dout("fill_dirfrag removed %llx.%llx frag %x"
+ " (no ref)\n", ceph_vinop(inode), id);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ } else {
+ /* tree branch, keep and clear */
+ dout("fill_dirfrag cleared %llx.%llx frag %x"
+ " referral\n", ceph_vinop(inode), id);
+ frag->mds = -1;
+ frag->ndist = 0;
+ }
+ goto out;
+ }
+
+
+ /* find/add this frag to store mds delegation info */
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag)) {
+ /* this is not the end of the world; we can continue
+ with bad/inaccurate delegation info */
+ pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+ ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ err = -ENOMEM;
+ goto out;
+ }
+
+ frag->mds = mds;
+ frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+ for (i = 0; i < frag->ndist; i++)
+ frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+ dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+ ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return err;
+}
+
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+ struct ceph_inode_info *ci;
+ int i;
+
+ ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ if (!ci)
+ return NULL;
+
+ dout("alloc_inode %p\n", &ci->vfs_inode);
+
+ ci->i_version = 0;
+ ci->i_time_warp_seq = 0;
+ ci->i_ceph_flags = 0;
+ ci->i_release_count = 0;
+ ci->i_symlink = NULL;
+
+ ci->i_fragtree = RB_ROOT;
+ mutex_init(&ci->i_fragtree_mutex);
+
+ ci->i_xattrs.blob = NULL;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ ci->i_xattrs.index = RB_ROOT;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.version = 0;
+ ci->i_xattrs.index_version = 0;
+
+ ci->i_caps = RB_ROOT;
+ ci->i_auth_cap = NULL;
+ ci->i_dirty_caps = 0;
+ ci->i_flushing_caps = 0;
+ INIT_LIST_HEAD(&ci->i_dirty_item);
+ INIT_LIST_HEAD(&ci->i_flushing_item);
+ ci->i_cap_flush_seq = 0;
+ ci->i_cap_flush_last_tid = 0;
+ memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+ init_waitqueue_head(&ci->i_cap_wq);
+ ci->i_hold_caps_min = 0;
+ ci->i_hold_caps_max = 0;
+ INIT_LIST_HEAD(&ci->i_cap_delay_list);
+ ci->i_cap_exporting_mds = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_issued = 0;
+ INIT_LIST_HEAD(&ci->i_cap_snaps);
+ ci->i_head_snapc = NULL;
+ ci->i_snap_caps = 0;
+
+ for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ ci->i_nr_by_mode[i] = 0;
+
+ ci->i_truncate_seq = 0;
+ ci->i_truncate_size = 0;
+ ci->i_truncate_pending = 0;
+
+ ci->i_max_size = 0;
+ ci->i_reported_size = 0;
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+
+ ci->i_pin_ref = 0;
+ ci->i_rd_ref = 0;
+ ci->i_rdcache_ref = 0;
+ ci->i_wr_ref = 0;
+ ci->i_wrbuffer_ref = 0;
+ ci->i_wrbuffer_ref_head = 0;
+ ci->i_shared_gen = 0;
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+
+ INIT_LIST_HEAD(&ci->i_unsafe_writes);
+ INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ spin_lock_init(&ci->i_unsafe_lock);
+
+ ci->i_snap_realm = NULL;
+ INIT_LIST_HEAD(&ci->i_snap_realm_item);
+ INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+ INIT_WORK(&ci->i_wb_work, ceph_inode_writeback);
+ INIT_WORK(&ci->i_pg_inv_work, ceph_inode_invalidate_pages);
+
+ INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+ return &ci->vfs_inode;
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *n;
+
+ dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ ceph_queue_caps_release(inode);
+
+ kfree(ci->i_symlink);
+ while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+ frag = rb_entry(n, struct ceph_inode_frag, node);
+ rb_erase(n, &ci->i_fragtree);
+ kfree(frag);
+ }
+
+ __ceph_destroy_xattrs(ci);
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+ kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime. We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int queue_trunc = 0;
+
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+ (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+ dout("size %lld -> %llu\n", inode->i_size, size);
+ inode->i_size = size;
+ inode->i_blocks = (size + (1<<9) - 1) >> 9;
+ ci->i_reported_size = size;
+ if (truncate_seq != ci->i_truncate_seq) {
+ dout("truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
+ ci->i_truncate_seq = truncate_seq;
+ if (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
+ CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+ CEPH_CAP_FILE_EXCL)) {
+ ci->i_truncate_pending++;
+ queue_trunc = 1;
+ }
+ }
+ }
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+ ci->i_truncate_size != truncate_size) {
+ dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+ truncate_size);
+ ci->i_truncate_size = truncate_size;
+ }
+ return queue_trunc;
+}
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec *ctime,
+ struct timespec *mtime, struct timespec *atime)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int warn = 0;
+
+ if (issued & (CEPH_CAP_FILE_EXCL|
+ CEPH_CAP_FILE_WR|
+ CEPH_CAP_FILE_BUFFER)) {
+ if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+ dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ ctime->tv_sec, ctime->tv_nsec);
+ inode->i_ctime = *ctime;
+ }
+ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+ /* the MDS did a utimes() */
+ dout("mtime %ld.%09ld -> %ld.%09ld "
+ "tw %d -> %d\n",
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec,
+ ci->i_time_warp_seq, (int)time_warp_seq);
+
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else if (time_warp_seq == ci->i_time_warp_seq) {
+ /* nobody did utimes(); take the max */
+ if (timespec_compare(mtime, &inode->i_mtime) > 0) {
+ dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
+ inode->i_mtime.tv_sec,
+ inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec);
+ inode->i_mtime = *mtime;
+ }
+ if (timespec_compare(atime, &inode->i_atime) > 0) {
+ dout("atime %ld.%09ld -> %ld.%09ld inc\n",
+ inode->i_atime.tv_sec,
+ inode->i_atime.tv_nsec,
+ atime->tv_sec, atime->tv_nsec);
+ inode->i_atime = *atime;
+ }
+ } else if (issued & CEPH_CAP_FILE_EXCL) {
+ /* we did a utimes(); ignore mds values */
+ } else {
+ warn = 1;
+ }
+ } else {
+ /* we have no write caps; whatever the MDS says is true */
+ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+ inode->i_ctime = *ctime;
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else {
+ warn = 1;
+ }
+ }
+ if (warn) /* time_warp_seq shouldn't go backwards */
+ dout("%p mds time_warp_seq %llu < %u\n",
+ inode, time_warp_seq, ci->i_time_warp_seq);
+}
+
+/*
+ * Populate an inode based on info from mds. May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session,
+ unsigned long ttl_from, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_reply_inode *info = iinfo->in;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int i;
+ int issued, implemented;
+ struct timespec mtime, atime, ctime;
+ u32 nsplits;
+ struct ceph_buffer *xattr_blob = NULL;
+ int err = 0;
+ int queue_trunc = 0;
+
+ dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ inode, ceph_vinop(inode), le64_to_cpu(info->version),
+ ci->i_version);
+
+ /*
+ * prealloc xattr data, if it looks like we'll need it. only
+ * if len > 4 (meaning there are actually xattrs; the first 4
+ * bytes are the xattr count).
+ */
+ if (iinfo->xattr_len > 4) {
+ xattr_blob = ceph_buffer_new_alloc(iinfo->xattr_len, GFP_NOFS);
+ if (!xattr_blob)
+ pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
+ }
+
+ spin_lock(&inode->i_lock);
+
+ /*
+ * provided version will be odd if inode value is projected,
+ * even if stable. skip the update if we have a newer info
+ * (e.g., due to inode info racing form multiple MDSs), or if
+ * we are getting projected (unstable) inode info.
+ */
+ if (le64_to_cpu(info->version) > 0 &&
+ (ci->i_version & ~1) > le64_to_cpu(info->version))
+ goto no_change;
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+
+ /* update inode */
+ ci->i_version = le64_to_cpu(info->version);
+ inode->i_version++;
+ inode->i_rdev = le32_to_cpu(info->rdev);
+
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_uid = le32_to_cpu(info->uid);
+ inode->i_gid = le32_to_cpu(info->gid);
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ inode->i_uid, inode->i_gid);
+ }
+
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ inode->i_nlink = le32_to_cpu(info->nlink);
+
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
+
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ ci->i_layout = info->layout;
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+
+ /* xattrs */
+ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = xattr_blob;
+ if (xattr_blob)
+ memcpy(ci->i_xattrs.blob->vec.iov_base,
+ iinfo->xattr_data, iinfo->xattr_len);
+ ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ }
+
+ inode->i_mapping->a_ops = &ceph_aops;
+ inode->i_mapping->backing_dev_info =
+ &ceph_client(inode->i_sb)->backing_dev_info;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFSOCK:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &ceph_file_iops;
+ break;
+ case S_IFREG:
+ inode->i_op = &ceph_file_iops;
+ inode->i_fop = &ceph_file_fops;
+ break;
+ case S_IFLNK:
+ inode->i_op = &ceph_symlink_iops;
+ if (!ci->i_symlink) {
+ int symlen = iinfo->symlink_len;
+ char *sym;
+
+ BUG_ON(symlen != inode->i_size);
+ spin_unlock(&inode->i_lock);
+
+ err = -ENOMEM;
+ sym = kmalloc(symlen+1, GFP_NOFS);
+ if (!sym)
+ goto out;
+ memcpy(sym, iinfo->symlink, symlen);
+ sym[symlen] = 0;
+
+ spin_lock(&inode->i_lock);
+ if (!ci->i_symlink)
+ ci->i_symlink = sym;
+ else
+ kfree(sym); /* lost a race */
+ }
+ break;
+ case S_IFDIR:
+ inode->i_op = &ceph_dir_iops;
+ inode->i_fop = &ceph_dir_fops;
+
+ ci->i_files = le64_to_cpu(info->files);
+ ci->i_subdirs = le64_to_cpu(info->subdirs);
+ ci->i_rbytes = le64_to_cpu(info->rbytes);
+ ci->i_rfiles = le64_to_cpu(info->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+
+ /* set dir completion flag? */
+ if (ci->i_files == 0 && ci->i_subdirs == 0 &&
+ ceph_snap(inode) == CEPH_NOSNAP &&
+ (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+ dout(" marking %p complete (empty)\n", inode);
+ ci->i_ceph_flags |= CEPH_I_COMPLETE;
+ ci->i_max_offset = 2;
+ }
+
+ /* it may be better to set st_size in getattr instead? */
+ if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+ inode->i_size = ci->i_rbytes;
+ break;
+ default:
+ pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ ceph_vinop(inode), inode->i_mode);
+ }
+
+no_change:
+ spin_unlock(&inode->i_lock);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+ &ci->i_vmtruncate_work))
+ igrab(inode);
+
+ /* populate frag tree */
+ /* FIXME: move me up, if/when version reflects fragtree changes */
+ nsplits = le32_to_cpu(info->fragtree.nsplits);
+ mutex_lock(&ci->i_fragtree_mutex);
+ for (i = 0; i < nsplits; i++) {
+ u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
+ struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
+
+ if (IS_ERR(frag))
+ continue;
+ frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ mutex_unlock(&ci->i_fragtree_mutex);
+
+ /* were we issued a capability? */
+ if (info->cap.caps) {
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ ceph_add_cap(inode, session,
+ le64_to_cpu(info->cap.cap_id),
+ cap_fmode,
+ le32_to_cpu(info->cap.caps),
+ le32_to_cpu(info->cap.wanted),
+ le32_to_cpu(info->cap.seq),
+ le32_to_cpu(info->cap.mseq),
+ le64_to_cpu(info->cap.realm),
+ info->cap.flags,
+ caps_reservation);
+ } else {
+ spin_lock(&inode->i_lock);
+ dout(" %p got snap_caps %s\n", inode,
+ ceph_cap_string(le32_to_cpu(info->cap.caps)));
+ ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+ if (cap_fmode >= 0)
+ __ceph_get_fmode(ci, cap_fmode);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+
+ /* update delegation info? */
+ if (dirinfo)
+ ceph_fill_dirfrag(inode, dirinfo);
+
+ err = 0;
+
+out:
+ ceph_buffer_put(xattr_blob);
+ return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ long unsigned duration = le32_to_cpu(lease->duration_ms);
+ long unsigned ttl = from_time + (duration * HZ) / 1000;
+ long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+ struct inode *dir;
+
+ /* only track leases on regular dentries */
+ if (dentry->d_op != &ceph_dentry_ops)
+ return;
+
+ spin_lock(&dentry->d_lock);
+ dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
+ dentry, le16_to_cpu(lease->mask), duration, ttl);
+
+ /* make lease_rdcache_gen match directory */
+ dir = dentry->d_parent->d_inode;
+ di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+
+ if (lease->mask == 0)
+ goto out_unlock;
+
+ if (di->lease_gen == session->s_cap_gen &&
+ time_before(ttl, dentry->d_time))
+ goto out_unlock; /* we already have a newer lease. */
+
+ if (di->lease_session && di->lease_session != session)
+ goto out_unlock;
+
+ ceph_dentry_lru_touch(dentry);
+
+ if (!di->lease_session)
+ di->lease_session = ceph_get_mds_session(session);
+ di->lease_gen = session->s_cap_gen;
+ di->lease_seq = le32_to_cpu(lease->seq);
+ di->lease_renew_after = half_ttl;
+ di->lease_renew_from = 0;
+ dentry->d_time = ttl;
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ return;
+}
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+ bool *prehash)
+{
+ struct dentry *realdn;
+
+ /* dn must be unhashed */
+ if (!d_unhashed(dn))
+ d_drop(dn);
+ realdn = d_materialise_unique(dn, in);
+ if (IS_ERR(realdn)) {
+ pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
+ dn, in, ceph_vinop(in));
+ if (prehash)
+ *prehash = false; /* don't rehash on error */
+ dn = realdn; /* note realdn contains the error */
+ goto out;
+ } else if (realdn) {
+ dout("dn %p (%d) spliced with %p (%d) "
+ "inode %p ino %llx.%llx\n",
+ dn, atomic_read(&dn->d_count),
+ realdn, atomic_read(&realdn->d_count),
+ realdn->d_inode, ceph_vinop(realdn->d_inode));
+ dput(dn);
+ dn = realdn;
+ } else {
+ BUG_ON(!ceph_dentry(dn));
+
+ dout("dn %p attached to %p ino %llx.%llx\n",
+ dn, dn->d_inode, ceph_vinop(dn->d_inode));
+ }
+ if ((!prehash || *prehash) && d_unhashed(dn))
+ d_rehash(dn);
+out:
+ return dn;
+}
+
+/*
+ * Incorporate results into the local cache. This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain
+ * a directory inode along with a dentry.
+ * and/or a target inode
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct inode *in = NULL;
+ struct ceph_mds_reply_inode *ininfo;
+ struct ceph_vino vino;
+ int i = 0;
+ int err = 0;
+
+ dout("fill_trace %p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
+
+#if 0
+ /*
+ * Debugging hook:
+ *
+ * If we resend completed ops to a recovering mds, we get no
+ * trace. Since that is very rare, pretend this is the case
+ * to ensure the 'no trace' handlers in the callers behave.
+ *
+ * Fill in inodes unconditionally to avoid breaking cap
+ * invariants.
+ */
+ if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
+ pr_info("fill_trace faking empty trace on %lld %s\n",
+ req->r_tid, ceph_mds_op_name(rinfo->head->op));
+ if (rinfo->head->is_dentry) {
+ rinfo->head->is_dentry = 0;
+ err = fill_inode(req->r_locked_dir,
+ &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1);
+ }
+ if (rinfo->head->is_target) {
+ rinfo->head->is_target = 0;
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ in = ceph_get_inode(sb, vino);
+ err = fill_inode(in, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ req->r_fmode);
+ iput(in);
+ }
+ }
+#endif
+
+ if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+ dout("fill_trace reply is empty!\n");
+ if (rinfo->head->result == 0 && req->r_locked_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_locked_dir);
+ dout(" clearing %p complete (empty trace)\n",
+ req->r_locked_dir);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ }
+ return 0;
+ }
+
+ if (rinfo->head->is_dentry) {
+ /*
+ * lookup link rename : null -> possibly existing inode
+ * mknod symlink mkdir : null -> new inode
+ * unlink : linked -> null
+ */
+ struct inode *dir = req->r_locked_dir;
+ struct dentry *dn = req->r_dentry;
+ bool have_dir_cap, have_lease;
+
+ BUG_ON(!dn);
+ BUG_ON(!dir);
+ BUG_ON(dn->d_parent->d_inode != dir);
+ BUG_ON(ceph_ino(dir) !=
+ le64_to_cpu(rinfo->diri.in->ino));
+ BUG_ON(ceph_snap(dir) !=
+ le64_to_cpu(rinfo->diri.in->snapid));
+
+ err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (err < 0)
+ return err;
+
+ /* do we have a lease on the whole dir? */
+ have_dir_cap =
+ (le32_to_cpu(rinfo->diri.in->cap.caps) &
+ CEPH_CAP_FILE_SHARED);
+
+ /* do we have a dn lease? */
+ have_lease = have_dir_cap ||
+ (le16_to_cpu(rinfo->dlease->mask) &
+ CEPH_LOCK_DN);
+
+ if (!have_lease)
+ dout("fill_trace no dentry lease or dir cap\n");
+
+ /* rename? */
+ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ dout(" src %p '%.*s' dst %p '%.*s'\n",
+ req->r_old_dentry,
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ dn, dn->d_name.len, dn->d_name.name);
+ dout("fill_trace doing d_move %p -> %p\n",
+ req->r_old_dentry, dn);
+ d_move(req->r_old_dentry, dn);
+ dout(" src %p '%.*s' dst %p '%.*s'\n",
+ req->r_old_dentry,
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ dn, dn->d_name.len, dn->d_name.name);
+ /* take overwritten dentry's readdir offset */
+ ceph_dentry(req->r_old_dentry)->offset =
+ ceph_dentry(dn)->offset;
+ dn = req->r_old_dentry; /* use old_dentry */
+ in = dn->d_inode;
+ }
+
+ /* null dentry? */
+ if (!rinfo->head->is_target) {
+ dout("fill_trace null dentry\n");
+ if (dn->d_inode) {
+ dout("d_delete %p\n", dn);
+ d_delete(dn);
+ } else {
+ dout("d_instantiate %p NULL\n", dn);
+ d_instantiate(dn, NULL);
+ if (have_lease && d_unhashed(dn))
+ d_rehash(dn);
+ update_dentry_lease(dn, rinfo->dlease,
+ session,
+ req->r_request_started);
+ }
+ goto done;
+ }
+
+ /* attach proper inode */
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ if (!dn->d_inode) {
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ pr_err("fill_trace bad get_inode "
+ "%llx.%llx\n", vino.ino, vino.snap);
+ err = PTR_ERR(in);
+ d_delete(dn);
+ goto done;
+ }
+ dn = splice_dentry(dn, in, &have_lease);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ req->r_dentry = dn; /* may have spliced */
+ igrab(in);
+ } else if (ceph_ino(in) == vino.ino &&
+ ceph_snap(in) == vino.snap) {
+ igrab(in);
+ } else {
+ dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, in, ceph_ino(in), ceph_snap(in),
+ vino.ino, vino.snap);
+ have_lease = false;
+ in = NULL;
+ }
+
+ if (have_lease)
+ update_dentry_lease(dn, rinfo->dlease, session,
+ req->r_request_started);
+ dout(" final dn %p\n", dn);
+ i++;
+ } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP) {
+ struct dentry *dn = req->r_dentry;
+
+ /* fill out a snapdir LOOKUPSNAP dentry */
+ BUG_ON(!dn);
+ BUG_ON(!req->r_locked_dir);
+ BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ pr_err("fill_inode get_inode badness %llx.%llx\n",
+ vino.ino, vino.snap);
+ err = PTR_ERR(in);
+ d_delete(dn);
+ goto done;
+ }
+ dout(" linking snapped dir %p to dn %p\n", in, dn);
+ dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ req->r_dentry = dn; /* may have spliced */
+ igrab(in);
+ rinfo->head->is_dentry = 1; /* fool notrace handlers */
+ }
+
+ if (rinfo->head->is_target) {
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ if (in == NULL || ceph_ino(in) != vino.ino ||
+ ceph_snap(in) != vino.snap) {
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in,
+ &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (le32_to_cpu(rinfo->head->result) == 0) ?
+ req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
+ }
+
+done:
+ dout("fill_trace done err=%d\n", err);
+ return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct dentry *parent = req->r_dentry;
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct qstr dname;
+ struct dentry *dn;
+ struct inode *in;
+ int err = 0, i;
+ struct inode *snapdir = NULL;
+ struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+ u64 frag = le32_to_cpu(rhead->args.readdir.frag);
+ struct ceph_dentry_info *di;
+
+ if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+ snapdir = ceph_get_snapdir(parent->d_inode);
+ parent = d_find_alias(snapdir);
+ dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
+ } else {
+ dout("readdir_prepopulate %d items under dn %p\n",
+ rinfo->dir_nr, parent);
+ if (rinfo->dir_dir)
+ ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+ }
+
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_vino vino;
+
+ dname.name = rinfo->dir_dname[i];
+ dname.len = rinfo->dir_dname_len[i];
+ dname.hash = full_name_hash(dname.name, dname.len);
+
+ vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+ vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dout("d_alloc badness\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0)
+ goto out;
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ } else {
+ /* reorder parent's d_subdirs */
+ spin_lock(&dcache_lock);
+ spin_lock(&dn->d_lock);
+ list_move(&dn->d_u.d_child, &parent->d_subdirs);
+ spin_unlock(&dn->d_lock);
+ spin_unlock(&dcache_lock);
+ }
+
+ di = dn->d_fsdata;
+ di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+
+ /* inode */
+ if (dn->d_inode) {
+ in = dn->d_inode;
+ } else {
+ in = ceph_get_inode(parent->d_sb, vino);
+ if (in == NULL) {
+ dout("new_inode badness\n");
+ d_delete(dn);
+ dput(dn);
+ err = -ENOMEM;
+ goto out;
+ }
+ dn = splice_dentry(dn, in, NULL);
+ }
+
+ if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation) < 0) {
+ pr_err("fill_inode badness on %p\n", in);
+ dput(dn);
+ continue;
+ }
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session, req->r_request_started);
+ dput(dn);
+ }
+ req->r_did_prepopulate = true;
+
+out:
+ if (snapdir) {
+ iput(snapdir);
+ dput(parent);
+ }
+ dout("readdir_prepopulate done\n");
+ return err;
+}
+
+int ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ inode->i_size = size;
+ inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+
+ /* tell the MDS if we are approaching max_size */
+ if ((size << 1) >= ci->i_max_size &&
+ (ci->i_reported_size << 1) < ci->i_max_size)
+ ret = 1;
+
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/*
+ * Write back inode data in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+void ceph_inode_writeback(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_wb_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("writeback %p\n", inode);
+ filemap_fdatawrite(&inode->i_data);
+ iput(inode);
+}
+
+/*
+ * Invalidate inode pages in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_inode_invalidate_pages(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_pg_inv_work);
+ struct inode *inode = &ci->vfs_inode;
+ u32 orig_gen;
+ int check = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("invalidate_pages %p gen %d revoking %d\n", inode,
+ ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ if (ci->i_rdcache_gen == 0 ||
+ ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+ /* nevermind! */
+ ci->i_rdcache_revoking = 0;
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+ orig_gen = ci->i_rdcache_gen;
+ spin_unlock(&inode->i_lock);
+
+ truncate_inode_pages(&inode->i_data, 0);
+
+ spin_lock(&inode->i_lock);
+ if (orig_gen == ci->i_rdcache_gen) {
+ dout("invalidate_pages %p gen %d successful\n", inode,
+ ci->i_rdcache_gen);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ check = 1;
+ } else {
+ dout("invalidate_pages %p gen %d raced, gen now %d\n",
+ inode, orig_gen, ci->i_rdcache_gen);
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (check)
+ ceph_check_caps(ci, 0, NULL);
+out:
+ iput(inode);
+}
+
+
+/*
+ * called by trunc_wq; take i_mutex ourselves
+ *
+ * We also truncate in a separate thread as well.
+ */
+void ceph_vmtruncate_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_vmtruncate_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("vmtruncate_work %p\n", inode);
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+}
+
+/*
+ * called with i_mutex held.
+ *
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 to;
+ int wrbuffer_refs, wake = 0;
+
+retry:
+ spin_lock(&inode->i_lock);
+ if (ci->i_truncate_pending == 0) {
+ dout("__do_pending_vmtruncate %p none pending\n", inode);
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ /*
+ * make sure any dirty snapped pages are flushed before we
+ * possibly truncate them.. so write AND block!
+ */
+ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+ dout("__do_pending_vmtruncate %p flushing snaps first\n",
+ inode);
+ spin_unlock(&inode->i_lock);
+ filemap_write_and_wait_range(&inode->i_data, 0,
+ inode->i_sb->s_maxbytes);
+ goto retry;
+ }
+
+ to = ci->i_truncate_size;
+ wrbuffer_refs = ci->i_wrbuffer_ref;
+ dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+ ci->i_truncate_pending, to);
+ spin_unlock(&inode->i_lock);
+
+ truncate_inode_pages(inode->i_mapping, to);
+
+ spin_lock(&inode->i_lock);
+ ci->i_truncate_pending--;
+ if (ci->i_truncate_pending == 0)
+ wake = 1;
+ spin_unlock(&inode->i_lock);
+
+ if (wrbuffer_refs == 0)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+}
+
+
+/*
+ * symlinks
+ */
+static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
+ nd_set_link(nd, ci->i_symlink);
+ return NULL;
+}
+
+static const struct inode_operations ceph_symlink_iops = {
+ .readlink = generic_readlink,
+ .follow_link = ceph_sym_follow_link,
+};
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ const unsigned int ia_valid = attr->ia_valid;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+ int issued;
+ int release = 0, dirtied = 0;
+ int mask = 0;
+ int err = 0;
+ int queue_trunc = 0;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ __ceph_do_pending_vmtruncate(inode);
+
+ err = inode_change_ok(inode, attr);
+ if (err != 0)
+ return err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ spin_lock(&inode->i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (ia_valid & ATTR_UID) {
+ dout("setattr %p uid %d -> %d\n", inode,
+ inode->i_uid, attr->ia_uid);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_uid = attr->ia_uid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_uid != inode->i_uid) {
+ req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+ mask |= CEPH_SETATTR_UID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_GID) {
+ dout("setattr %p gid %d -> %d\n", inode,
+ inode->i_gid, attr->ia_gid);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_gid = attr->ia_gid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_gid != inode->i_gid) {
+ req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+ mask |= CEPH_SETATTR_GID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_MODE) {
+ dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+ attr->ia_mode);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_mode = attr->ia_mode;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_mode != inode->i_mode) {
+ req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+ mask |= CEPH_SETATTR_MODE;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+
+ if (ia_valid & ATTR_ATIME) {
+ dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
+ inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec_compare(&inode->i_atime,
+ &attr->ia_atime) < 0) {
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
+ ceph_encode_timespec(&req->r_args.setattr.atime,
+ &attr->ia_atime);
+ mask |= CEPH_SETATTR_ATIME;
+ release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_MTIME) {
+ dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec_compare(&inode->i_mtime,
+ &attr->ia_mtime) < 0) {
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ ceph_encode_timespec(&req->r_args.setattr.mtime,
+ &attr->ia_mtime);
+ mask |= CEPH_SETATTR_MTIME;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_SIZE) {
+ dout("setattr %p size %lld -> %lld\n", inode,
+ inode->i_size, attr->ia_size);
+ if (attr->ia_size > inode->i_sb->s_maxbytes) {
+ err = -EINVAL;
+ goto out;
+ }
+ if ((issued & CEPH_CAP_FILE_EXCL) &&
+ attr->ia_size > inode->i_size) {
+ inode->i_size = attr->ia_size;
+ if (attr->ia_size < inode->i_size) {
+ ci->i_truncate_size = attr->ia_size;
+ ci->i_truncate_pending++;
+ queue_trunc = 1;
+ }
+ inode->i_blocks =
+ (attr->ia_size + (1 << 9) - 1) >> 9;
+ inode->i_ctime = attr->ia_ctime;
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ attr->ia_size != inode->i_size) {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size =
+ cpu_to_le64(inode->i_size);
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+
+ /* these do nothing */
+ if (ia_valid & ATTR_CTIME) {
+ bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+ ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+ dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
+ inode->i_ctime = attr->ia_ctime;
+ if (only) {
+ /*
+ * if kernel wants to dirty ctime but nothing else,
+ * we need to choose a cap to dirty under, or do
+ * a almost-no-op setattr
+ */
+ if (issued & CEPH_CAP_AUTH_EXCL)
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ else if (issued & CEPH_CAP_FILE_EXCL)
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ else if (issued & CEPH_CAP_XATTR_EXCL)
+ dirtied |= CEPH_CAP_XATTR_EXCL;
+ else
+ mask |= CEPH_SETATTR_CTIME;
+ }
+ }
+ if (ia_valid & ATTR_FILE)
+ dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+ if (dirtied) {
+ __ceph_mark_dirty_caps(ci, dirtied);
+ inode->i_ctime = CURRENT_TIME;
+ }
+
+ release &= issued;
+ spin_unlock(&inode->i_lock);
+
+ if (queue_trunc)
+ __ceph_do_pending_vmtruncate(inode);
+
+ if (mask) {
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = release;
+ req->r_args.setattr.mask = cpu_to_le32(mask);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ }
+ dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+ ceph_cap_string(dirtied), mask);
+
+ ceph_mdsc_put_request(req);
+ __ceph_do_pending_vmtruncate(inode);
+ return err;
+out:
+ spin_unlock(&inode->i_lock);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask. If not,
+ * do a getattr against an mds.
+ */
+int ceph_do_getattr(struct inode *inode, int mask)
+{
+ struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("do_getattr inode %p SNAPDIR\n", inode);
+ return 0;
+ }
+
+ dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+ if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+ return 0;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_num_caps = 1;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ dout("do_getattr result=%d\n", err);
+ return err;
+}
+
+
+/*
+ * Check inode permissions. We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+ int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+
+ if (!err)
+ err = generic_permission(inode, mask, NULL);
+ return err;
+}
+
+/*
+ * Get all attributes. Hopefully somedata we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err;
+
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
+ if (!err) {
+ generic_fillattr(inode, stat);
+ stat->ino = inode->i_ino;
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ stat->dev = ceph_snap(inode);
+ else
+ stat->dev = 0;
+ if (S_ISDIR(inode->i_mode)) {
+ stat->size = ci->i_rbytes;
+ stat->blocks = 0;
+ stat->blksize = 65536;
+ }
+ }
+ return err;
+}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..4c33e19fc241
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,157 @@
+#include <linux/in.h>
+
+#include "ioctl.h"
+#include "super.h"
+#include "ceph_debug.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_ioctl_layout l;
+ int err;
+
+ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ if (!err) {
+ l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ l.object_size = ceph_file_layout_object_size(ci->i_layout);
+ l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ if (copy_to_user(arg, &l, sizeof(l)))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err, i;
+
+ /* copy and validate */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ if ((l.object_size & ~PAGE_MASK) ||
+ (l.stripe_unit & ~PAGE_MASK) ||
+ !l.stripe_unit ||
+ (l.object_size &&
+ (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ if (l.data_pool > 0) {
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+ }
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+ req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32((s32)-1);
+
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+ struct ceph_ioctl_dataloc dl;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ u64 len = 1, olen;
+ u64 tmp;
+ struct ceph_object_layout ol;
+ struct ceph_pg pgid;
+
+ /* copy and validate */
+ if (copy_from_user(&dl, arg, sizeof(dl)))
+ return -EFAULT;
+
+ down_read(&osdc->map_sem);
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ &dl.object_no, &dl.object_offset, &olen);
+ dl.file_offset -= dl.object_offset;
+ dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+ /* block_offset = object_offset % block_size */
+ tmp = dl.object_offset;
+ dl.block_offset = do_div(tmp, dl.block_size);
+
+ snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+ ceph_ino(inode), dl.object_no);
+ ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+ osdc->osdmap);
+
+ pgid = ol.ol_pgid;
+ dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ if (dl.osd >= 0) {
+ struct ceph_entity_addr *a =
+ ceph_osd_addr(osdc->osdmap, dl.osd);
+ if (a)
+ memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+ } else {
+ memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+ }
+ up_read(&osdc->map_sem);
+
+ /* send result back to user */
+ if (copy_to_user(arg, &dl, sizeof(dl)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_SET_LAYOUT:
+ return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_GET_DATALOC:
+ return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+ }
+ return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..3c511dab3730
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,39 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/* just use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+ __u64 stripe_unit, stripe_count, object_size;
+ __u64 data_pool;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
+ struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
+ struct ceph_ioctl_layout)
+
+/*
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+ __u64 file_offset; /* in+out: file offset */
+ __u64 object_offset; /* out: offset in object */
+ __u64 object_no; /* out: object # */
+ __u64 object_size; /* out: object size */
+ char object_name[64]; /* out: object name */
+ __u64 block_offset; /* out: offset in block */
+ __u64 block_size; /* out: block length */
+ __s64 osd; /* out: osd # */
+ struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
+ struct ceph_ioctl_dataloc)
+
+#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..7da836909abb
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,2976 @@
+#include "ceph_debug.h"
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "mds_client.h"
+#include "mon_client.h"
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage. Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid. If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head);
+
+const static struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ int err = -EIO;
+
+ info->in = *p;
+ *p += sizeof(struct ceph_mds_reply_inode) +
+ sizeof(*info->in->fragtree.splits) *
+ le32_to_cpu(info->in->fragtree.nsplits);
+
+ ceph_decode_32_safe(p, end, info->symlink_len, bad);
+ ceph_decode_need(p, end, info->symlink_len, bad);
+ info->symlink = *p;
+ *p += info->symlink_len;
+
+ ceph_decode_32_safe(p, end, info->xattr_len, bad);
+ ceph_decode_need(p, end, info->xattr_len, bad);
+ info->xattr_data = *p;
+ *p += info->xattr_len;
+ return 0;
+bad:
+ return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ int err;
+
+ if (info->head->is_dentry) {
+ err = parse_reply_info_in(p, end, &info->diri);
+ if (err < 0)
+ goto out_bad;
+
+ if (unlikely(*p + sizeof(*info->dirfrag) > end))
+ goto bad;
+ info->dirfrag = *p;
+ *p += sizeof(*info->dirfrag) +
+ sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+
+ ceph_decode_32_safe(p, end, info->dname_len, bad);
+ ceph_decode_need(p, end, info->dname_len, bad);
+ info->dname = *p;
+ *p += info->dname_len;
+ info->dlease = *p;
+ *p += sizeof(*info->dlease);
+ }
+
+ if (info->head->is_target) {
+ err = parse_reply_info_in(p, end, &info->targeti);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing mds trace %d\n", err);
+ return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ u32 num, i = 0;
+ int err;
+
+ info->dir_dir = *p;
+ if (*p + sizeof(*info->dir_dir) > end)
+ goto bad;
+ *p += sizeof(*info->dir_dir) +
+ sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+ if (*p > end)
+ goto bad;
+
+ ceph_decode_need(p, end, sizeof(num) + 2, bad);
+ num = ceph_decode_32(p);
+ info->dir_end = ceph_decode_8(p);
+ info->dir_complete = ceph_decode_8(p);
+ if (num == 0)
+ goto done;
+
+ /* alloc large array */
+ info->dir_nr = num;
+ info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
+ sizeof(*info->dir_dname) +
+ sizeof(*info->dir_dname_len) +
+ sizeof(*info->dir_dlease),
+ GFP_NOFS);
+ if (info->dir_in == NULL) {
+ err = -ENOMEM;
+ goto out_bad;
+ }
+ info->dir_dname = (void *)(info->dir_in + num);
+ info->dir_dname_len = (void *)(info->dir_dname + num);
+ info->dir_dlease = (void *)(info->dir_dname_len + num);
+
+ while (num) {
+ /* dentry */
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ info->dir_dname_len[i] = ceph_decode_32(p);
+ ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+ info->dir_dname[i] = *p;
+ *p += info->dir_dname_len[i];
+ dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+ info->dir_dname[i]);
+ info->dir_dlease[i] = *p;
+ *p += sizeof(struct ceph_mds_reply_lease);
+
+ /* inode */
+ err = parse_reply_info_in(p, end, &info->dir_in[i]);
+ if (err < 0)
+ goto out_bad;
+ i++;
+ num--;
+ }
+
+done:
+ if (*p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing dir contents %d\n", err);
+ return err;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ void *p, *end;
+ u32 len;
+ int err;
+
+ info->head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+ end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+ /* trace */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_trace(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* dir content */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_dir(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* snap blob */
+ ceph_decode_32_safe(&p, end, len, bad);
+ info->snapblob_len = len;
+ info->snapblob = p;
+ p += len;
+
+ if (p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("mds parse_reply err %d\n", err);
+ return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+ kfree(info->dir_in);
+}
+
+
+/*
+ * sessions
+ */
+static const char *session_state_name(int s)
+{
+ switch (s) {
+ case CEPH_MDS_SESSION_NEW: return "new";
+ case CEPH_MDS_SESSION_OPENING: return "opening";
+ case CEPH_MDS_SESSION_OPEN: return "open";
+ case CEPH_MDS_SESSION_HUNG: return "hung";
+ case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ default: return "???";
+ }
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+ if (atomic_inc_not_zero(&s->s_ref)) {
+ dout("mdsc get_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ return s;
+ } else {
+ dout("mdsc get_session %p 0 -- FAIL", s);
+ return NULL;
+ }
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+ dout("mdsc put_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+ if (atomic_dec_and_test(&s->s_ref)) {
+ if (s->s_authorizer)
+ s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+ s->s_mdsc->client->monc.auth, s->s_authorizer);
+ kfree(s);
+ }
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *session;
+
+ if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+ return NULL;
+ session = mdsc->sessions[mds];
+ dout("lookup_mds_session %p %d\n", session,
+ atomic_read(&session->s_ref));
+ get_session(session);
+ return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+ if (mds >= mdsc->max_sessions)
+ return false;
+ return mdsc->sessions[mds];
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *s;
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ s->s_mdsc = mdsc;
+ s->s_mds = mds;
+ s->s_state = CEPH_MDS_SESSION_NEW;
+ s->s_ttl = 0;
+ s->s_seq = 0;
+ mutex_init(&s->s_mutex);
+
+ ceph_con_init(mdsc->client->msgr, &s->s_con);
+ s->s_con.private = s;
+ s->s_con.ops = &mds_con_ops;
+ s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
+ s->s_con.peer_name.num = cpu_to_le64(mds);
+
+ spin_lock_init(&s->s_cap_lock);
+ s->s_cap_gen = 0;
+ s->s_cap_ttl = 0;
+ s->s_renew_requested = 0;
+ s->s_renew_seq = 0;
+ INIT_LIST_HEAD(&s->s_caps);
+ s->s_nr_caps = 0;
+ atomic_set(&s->s_ref, 1);
+ INIT_LIST_HEAD(&s->s_waiting);
+ INIT_LIST_HEAD(&s->s_unsafe);
+ s->s_num_cap_releases = 0;
+ INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_LIST_HEAD(&s->s_cap_releases_done);
+ INIT_LIST_HEAD(&s->s_cap_flushing);
+ INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+ dout("register_session mds%d\n", mds);
+ if (mds >= mdsc->max_sessions) {
+ int newmax = 1 << get_count_order(mds+1);
+ struct ceph_mds_session **sa;
+
+ dout("register_session realloc to %d\n", newmax);
+ sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ if (sa == NULL)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * sizeof(void *));
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+ mdsc->sessions[mds] = s;
+ atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+
+ ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ return s;
+
+fail_realloc:
+ kfree(s);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void unregister_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ dout("unregister_session mds%d %p\n", s->s_mds, s);
+ mdsc->sessions[s->s_mds] = NULL;
+ ceph_con_close(&s->s_con);
+ ceph_put_mds_session(s);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+ if (req->r_session) {
+ ceph_put_mds_session(req->r_session);
+ req->r_session = NULL;
+ }
+}
+
+void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+ dout("mdsc put_request %p %d -> %d\n", req,
+ atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1);
+ if (atomic_dec_and_test(&req->r_ref)) {
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply) {
+ ceph_msg_put(req->r_reply);
+ destroy_reply_info(&req->r_reply_info);
+ }
+ if (req->r_inode) {
+ ceph_put_cap_refs(ceph_inode(req->r_inode),
+ CEPH_CAP_PIN);
+ iput(req->r_inode);
+ }
+ if (req->r_locked_dir)
+ ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+ CEPH_CAP_PIN);
+ if (req->r_target_inode)
+ iput(req->r_target_inode);
+ if (req->r_dentry)
+ dput(req->r_dentry);
+ if (req->r_old_dentry) {
+ ceph_put_cap_refs(
+ ceph_inode(req->r_old_dentry->d_parent->d_inode),
+ CEPH_CAP_PIN);
+ dput(req->r_old_dentry);
+ }
+ kfree(req->r_path1);
+ kfree(req->r_path2);
+ put_request_session(req);
+ ceph_unreserve_caps(&req->r_caps_reservation);
+ kfree(req);
+ }
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+ u64 tid)
+{
+ struct ceph_mds_request *req;
+ req = radix_tree_lookup(&mdsc->request_tree, tid);
+ if (req)
+ ceph_mdsc_get_request(req);
+ return req;
+}
+
+/*
+ * Register an in-flight request, and assign a tid. Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ req->r_tid = ++mdsc->last_tid;
+ if (req->r_num_caps)
+ ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+ dout("__register_request %p tid %lld\n", req, req->r_tid);
+ ceph_mdsc_get_request(req);
+ radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req);
+
+ if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ req->r_unsafe_dir = dir;
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ radix_tree_delete(&mdsc->request_tree, req->r_tid);
+ ceph_mdsc_put_request(req);
+
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_dir_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+}
+
+/*
+ * Choose mds to send request to next. If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds. If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ int mode = req->r_direct_mode;
+ int mds = -1;
+ u32 hash = req->r_direct_hash;
+ bool is_hash = req->r_direct_is_hash;
+
+ /*
+ * is there a specific mds we should try? ignore hint if we have
+ * no session and the mds is not up (active or recovering).
+ */
+ if (req->r_resend_mds >= 0 &&
+ (__have_session(mdsc, req->r_resend_mds) ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+ dout("choose_mds using resend_mds mds%d\n",
+ req->r_resend_mds);
+ return req->r_resend_mds;
+ }
+
+ if (mode == USE_RANDOM_MDS)
+ goto random;
+
+ inode = NULL;
+ if (req->r_inode) {
+ inode = req->r_inode;
+ } else if (req->r_dentry) {
+ if (req->r_dentry->d_inode) {
+ inode = req->r_dentry->d_inode;
+ } else {
+ inode = req->r_dentry->d_parent->d_inode;
+ hash = req->r_dentry->d_name.hash;
+ is_hash = true;
+ }
+ }
+ dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+ (int)hash, mode);
+ if (!inode)
+ goto random;
+ ci = ceph_inode(inode);
+
+ if (is_hash && S_ISDIR(inode->i_mode)) {
+ struct ceph_inode_frag frag;
+ int found;
+
+ ceph_choose_frag(ci, hash, &frag, &found);
+ if (found) {
+ if (mode == USE_ANY_MDS && frag.ndist > 0) {
+ u8 r;
+
+ /* choose a random replica */
+ get_random_bytes(&r, 1);
+ r %= frag.ndist;
+ mds = frag.dist[r];
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode),
+ frag.frag, frag.mds,
+ (int)r, frag.ndist);
+ return mds;
+ }
+
+ /* since this file/dir wasn't known to be
+ * replicated, then we want to look for the
+ * authoritative mds. */
+ mode = USE_AUTH_MDS;
+ if (frag.mds >= 0) {
+ /* choose auth mds */
+ mds = frag.mds;
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
+ return mds;
+ }
+ }
+ }
+
+ spin_lock(&inode->i_lock);
+ cap = NULL;
+ if (mode == USE_AUTH_MDS)
+ cap = ci->i_auth_cap;
+ if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+ cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+ if (!cap) {
+ spin_unlock(&inode->i_lock);
+ goto random;
+ }
+ mds = cap->session->s_mds;
+ dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ inode, ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
+ spin_unlock(&inode->i_lock);
+ return mds;
+
+random:
+ mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+ dout("choose_mds chose random mds%d\n", mds);
+ return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ h = msg->front.iov_base;
+ h->op = cpu_to_le32(op);
+ h->seq = cpu_to_le64(seq);
+ return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int mstate;
+ int mds = session->s_mds;
+ int err = 0;
+
+ /* wait for mds to go active? */
+ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+ dout("open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
+ session->s_state = CEPH_MDS_SESSION_OPENING;
+ session->s_renew_requested = jiffies;
+
+ /* send connect message */
+ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+ if (IS_ERR(msg)) {
+ err = PTR_ERR(msg);
+ goto out;
+ }
+ ceph_con_send(&session->s_con, msg);
+
+out:
+ return 0;
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ spin_lock(&session->s_cap_lock);
+ while (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ while (!list_empty(&session->s_cap_releases_done)) {
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session.
+ *
+ * caller must hold session s_mutex
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, struct ceph_cap *,
+ void *), void *arg)
+{
+ struct ceph_cap *cap, *ncap;
+ struct inode *inode;
+ int ret;
+
+ dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ list_for_each_entry_safe(cap, ncap, &session->s_caps, session_caps) {
+ inode = igrab(&cap->ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&session->s_cap_lock);
+ ret = cb(inode, cap, arg);
+ iput(inode);
+ if (ret < 0)
+ return ret;
+ spin_lock(&session->s_cap_lock);
+ }
+ spin_unlock(&session->s_cap_lock);
+
+ return 0;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->vfs_inode);
+ ceph_remove_cap(cap);
+ return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+ dout("remove_session_caps on %p\n", session);
+ iterate_session_caps(session, remove_session_caps_cb, NULL);
+ BUG_ON(session->s_nr_caps > 0);
+ cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps. if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ wake_up(&ci->i_cap_wq);
+ if (arg) {
+ spin_lock(&inode->i_lock);
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+ int reconnect)
+{
+ dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ iterate_session_caps(session, wake_up_session_cb,
+ (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps. The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int state;
+
+ if (time_after_eq(jiffies, session->s_cap_ttl) &&
+ time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+ pr_info("mds%d caps stale\n", session->s_mds);
+
+ /* do not try to renew caps until a recovering mds has reconnected
+ * with its clients. */
+ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+ if (state < CEPH_MDS_STATE_RECONNECT) {
+ dout("send_renew_caps ignoring mds%d (%s)\n",
+ session->s_mds, ceph_mds_state_name(state));
+ return 0;
+ }
+
+ dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
+ session->s_renew_requested = jiffies;
+ msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, int is_renew)
+{
+ int was_stale;
+ int wake = 0;
+
+ spin_lock(&session->s_cap_lock);
+ was_stale = is_renew && (session->s_cap_ttl == 0 ||
+ time_after_eq(jiffies, session->s_cap_ttl));
+
+ session->s_cap_ttl = session->s_renew_requested +
+ mdsc->mdsmap->m_session_timeout*HZ;
+
+ if (was_stale) {
+ if (time_before(jiffies, session->s_cap_ttl)) {
+ pr_info("mds%d caps renewed\n", session->s_mds);
+ wake = 1;
+ } else {
+ pr_info("mds%d caps still stale\n", session->s_mds);
+ }
+ }
+ dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+ session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ spin_unlock(&session->s_cap_lock);
+
+ if (wake)
+ wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int err = 0;
+
+ dout("request_close_session mds%d state %s seq %lld\n",
+ session->s_mds, session_state_name(session->s_state),
+ session->s_seq);
+ msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ if (IS_ERR(msg))
+ err = PTR_ERR(msg);
+ else
+ ceph_con_send(&session->s_con, msg);
+ return err;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+ return 0;
+ session->s_state = CEPH_MDS_SESSION_CLOSING;
+ return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy. Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+ struct ceph_mds_session *session = arg;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int used, oissued, mine;
+
+ if (session->s_trim_caps <= 0)
+ return -1;
+
+ spin_lock(&inode->i_lock);
+ mine = cap->issued | cap->implemented;
+ used = __ceph_caps_used(ci);
+ oissued = __ceph_caps_issued_other(ci, cap);
+
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+ ceph_cap_string(used));
+ if (ci->i_dirty_caps)
+ goto out; /* dirty caps */
+ if ((used & ~oissued) & mine)
+ goto out; /* we need these caps */
+
+ session->s_trim_caps--;
+ if (oissued) {
+ /* we aren't the only cap.. just remove us */
+ __ceph_remove_cap(cap, NULL);
+ } else {
+ /* try to drop referring dentries */
+ spin_unlock(&inode->i_lock);
+ d_prune_aliases(inode);
+ dout("trim_caps_cb %p cap %p pruned, count now %d\n",
+ inode, cap, atomic_read(&inode->i_count));
+ return 0;
+ }
+
+out:
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps)
+{
+ int trim_caps = session->s_nr_caps - max_caps;
+
+ dout("trim_caps mds%d start: %d / %d, trim %d\n",
+ session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ if (trim_caps > 0) {
+ session->s_trim_caps = trim_caps;
+ iterate_session_caps(session, trim_caps_cb, session);
+ dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - session->s_trim_caps);
+ }
+ return 0;
+}
+
+/*
+ * Allocate cap_release messages. If there is a partially full message
+ * in the queue, try to allocate enough to cover it's remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+static int add_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int extra)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ int err = -ENOMEM;
+
+ if (extra < 0)
+ extra = mdsc->client->mount_args->cap_release_safety;
+
+ spin_lock(&session->s_cap_lock);
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+
+ while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+ spin_unlock(&session->s_cap_lock);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+ 0, 0, NULL);
+ if (!msg)
+ goto out_unlocked;
+ dout("add_cap_releases %p msg %p now %d\n", session, msg,
+ (int)msg->front.iov_len);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ spin_lock(&session->s_cap_lock);
+ list_add(&msg->list_head, &session->s_cap_releases);
+ session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+ }
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ if (head->num) {
+ dout(" queueing non-full %p (%d)\n", msg,
+ le32_to_cpu(head->num));
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ session->s_num_cap_releases -=
+ CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+ }
+ err = 0;
+ spin_unlock(&session->s_cap_lock);
+out_unlocked:
+ return err;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+ int mds, ret = 1;
+
+ dout("check_cap_flush want %lld\n", want_flush_seq);
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session = mdsc->sessions[mds];
+
+ if (!session)
+ continue;
+ get_session(session);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (!list_empty(&session->s_cap_flushing)) {
+ struct ceph_inode_info *ci =
+ list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_cap_flush_seq <= want_flush_seq) {
+ dout("check_cap_flush still flushing %p "
+ "seq %lld <= %lld to mds%d\n", inode,
+ ci->i_cap_flush_seq, want_flush_seq,
+ session->s_mds);
+ ret = 0;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ if (!ret)
+ return ret;
+ mutex_lock(&mdsc->mutex);
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+ return ret;
+}
+
+/*
+ * called under s_mutex
+ */
+static void send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ dout("send_cap_releases mds%d\n", session->s_mds);
+ while (1) {
+ spin_lock(&session->s_cap_lock);
+ if (list_empty(&session->s_cap_releases_done))
+ break;
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ spin_unlock(&session->s_cap_lock);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+ struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_started = jiffies;
+ req->r_resend_mds = -1;
+ INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ req->r_fmode = -1;
+ atomic_set(&req->r_ref, 1); /* one for request_tree, one for caller */
+ INIT_LIST_HEAD(&req->r_wait);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ req->r_op = op;
+ req->r_direct_mode = mode;
+ return req;
+}
+
+/*
+ * return oldest (lowest) tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_request *first;
+ if (radix_tree_gang_lookup(&mdsc->request_tree,
+ (void **)&first, 0, 1) <= 0)
+ return 0;
+ return first->r_tid;
+}
+
+/*
+ * Build a dentry's path. Allocate on heap; caller must kfree. Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ * foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap)
+{
+ struct dentry *temp;
+ char *path;
+ int len, pos;
+
+ if (dentry == NULL)
+ return ERR_PTR(-EINVAL);
+
+retry:
+ len = 0;
+ for (temp = dentry; !IS_ROOT(temp);) {
+ struct inode *inode = temp->d_inode;
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+ len++; /* slash only */
+ else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ else
+ len += 1 + temp->d_name.len;
+ temp = temp->d_parent;
+ if (temp == NULL) {
+ pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+ if (len)
+ len--; /* no leading '/' */
+
+ path = kmalloc(len+1, GFP_NOFS);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ pos = len;
+ path[pos] = 0; /* trailing null */
+ for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+ struct inode *inode = temp->d_inode;
+
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("build_path_dentry path+%d: %p SNAPDIR\n",
+ pos, temp);
+ } else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP) {
+ break;
+ } else {
+ pos -= temp->d_name.len;
+ if (pos < 0)
+ break;
+ strncpy(path + pos, temp->d_name.name,
+ temp->d_name.len);
+ dout("build_path_dentry path+%d: %p '%.*s'\n",
+ pos, temp, temp->d_name.len, path + pos);
+ }
+ if (pos)
+ path[--pos] = '/';
+ temp = temp->d_parent;
+ if (temp == NULL) {
+ pr_err("build_path_dentry corrupt dentry\n");
+ kfree(path);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+ if (pos != 0) {
+ pr_err("build_path_dentry did not end path lookup where "
+ "expected, namelen is %d, pos is %d\n", len, pos);
+ /* presumably this is only possible if racing with a
+ rename of one of the parent directories (we can not
+ lock the dentries above us to prevent this, but
+ retrying should be harmless) */
+ kfree(path);
+ goto retry;
+ }
+
+ *base = ceph_ino(temp->d_inode);
+ *plen = len;
+ dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+ dentry, atomic_read(&dentry->d_count), *base, len, path);
+ return path;
+}
+
+static int build_dentry_path(struct dentry *dentry,
+ const char **ppath, int *ppathlen, u64 *pino,
+ int *pfreepath)
+{
+ char *path;
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
+ *pino = ceph_ino(dentry->d_parent->d_inode);
+ *ppath = dentry->d_name.name;
+ *ppathlen = dentry->d_name.len;
+ return 0;
+ }
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = 1;
+ return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+ const char **ppath, int *ppathlen, u64 *pino,
+ int *pfreepath)
+{
+ struct dentry *dentry;
+ char *path;
+
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ *pino = ceph_ino(inode);
+ *ppathlen = 0;
+ return 0;
+ }
+ dentry = d_find_alias(inode);
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ dput(dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = 1;
+ return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+ const char *rpath, u64 rino,
+ const char **ppath, int *pathlen,
+ u64 *ino, int *freepath)
+{
+ int r = 0;
+
+ if (rinode) {
+ r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
+ } else if (rdentry) {
+ r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+ dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+ *ppath);
+ } else if (rpath) {
+ *ino = rino;
+ *ppath = rpath;
+ *pathlen = strlen(rpath);
+ dout(" path %.*s\n", *pathlen, rpath);
+ }
+
+ return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_request_head *head;
+ const char *path1 = NULL;
+ const char *path2 = NULL;
+ u64 ino1 = 0, ino2 = 0;
+ int pathlen1 = 0, pathlen2 = 0;
+ int freepath1 = 0, freepath2 = 0;
+ int len;
+ u16 releases;
+ void *p, *end;
+ int ret;
+
+ ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ req->r_path1, req->r_ino1.ino,
+ &path1, &pathlen1, &ino1, &freepath1);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out;
+ }
+
+ ret = set_request_path_attr(NULL, req->r_old_dentry,
+ req->r_path2, req->r_ino2.ino,
+ &path2, &pathlen2, &ino2, &freepath2);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out_free1;
+ }
+
+ len = sizeof(*head) +
+ pathlen1 + pathlen2 + 2*(sizeof(u32) + sizeof(u64));
+
+ /* calculate (max) length for cap releases */
+ len += sizeof(struct ceph_mds_request_release) *
+ (!!req->r_inode_drop + !!req->r_dentry_drop +
+ !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+ if (req->r_dentry_drop)
+ len += req->r_dentry->d_name.len;
+ if (req->r_old_dentry_drop)
+ len += req->r_old_dentry->d_name.len;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
+ if (IS_ERR(msg))
+ goto out_free2;
+
+ head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(*head);
+ end = msg->front.iov_base + msg->front.iov_len;
+
+ head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ head->op = cpu_to_le32(req->r_op);
+ head->caller_uid = cpu_to_le32(current_fsuid());
+ head->caller_gid = cpu_to_le32(current_fsgid());
+ head->args = req->r_args;
+
+ ceph_encode_filepath(&p, end, ino1, path1);
+ ceph_encode_filepath(&p, end, ino2, path2);
+
+ /* cap releases */
+ releases = 0;
+ if (req->r_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ req->r_inode ? req->r_inode : req->r_dentry->d_inode,
+ mds, req->r_inode_drop, req->r_inode_unless, 0);
+ if (req->r_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_dentry,
+ mds, req->r_dentry_drop, req->r_dentry_unless);
+ if (req->r_old_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+ mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+ if (req->r_old_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ req->r_old_dentry->d_inode,
+ mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+ head->num_releases = cpu_to_le16(releases);
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ msg->pages = req->r_pages;
+ msg->nr_pages = req->r_num_pages;
+ msg->hdr.data_len = cpu_to_le32(req->r_data_len);
+ msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+ if (freepath2)
+ kfree((char *)path2);
+out_free1:
+ if (freepath1)
+ kfree((char *)path1);
+out:
+ return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(mdsc, req);
+ else
+ complete(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds)
+{
+ struct ceph_mds_request_head *rhead;
+ struct ceph_msg *msg;
+ int flags = 0;
+
+ req->r_mds = mds;
+ req->r_attempts++;
+ dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+ if (req->r_request) {
+ ceph_msg_put(req->r_request);
+ req->r_request = NULL;
+ }
+ msg = create_request_message(mdsc, req, mds);
+ if (IS_ERR(msg)) {
+ req->r_reply = ERR_PTR(PTR_ERR(msg));
+ complete_request(mdsc, req);
+ return -PTR_ERR(msg);
+ }
+ req->r_request = msg;
+
+ rhead = msg->front.iov_base;
+ rhead->tid = cpu_to_le64(req->r_tid);
+ rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ if (req->r_got_unsafe)
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ if (req->r_locked_dir)
+ flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+ rhead->flags = cpu_to_le32(flags);
+ rhead->num_fwd = req->r_num_fwd;
+ rhead->num_retry = req->r_attempts - 1;
+
+ dout(" r_locked_dir = %p\n", req->r_locked_dir);
+
+ if (req->r_target_inode && req->r_got_unsafe)
+ rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+ else
+ rhead->ino = 0;
+ return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static int __do_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_mds_session *session = NULL;
+ int mds = -1;
+ int err = -EAGAIN;
+
+ if (req->r_reply)
+ goto out;
+
+ if (req->r_timeout &&
+ time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+ dout("do_request timed out\n");
+ err = -EIO;
+ goto finish;
+ }
+
+ mds = __choose_mds(mdsc, req);
+ if (mds < 0 ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ dout("do_request no mds or not active, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ goto out;
+ }
+
+ /* get, open session */
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ session = register_session(mdsc, mds);
+ dout("do_request mds%d session %p state %s\n", mds, session,
+ session_state_name(session->s_state));
+ if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+ session->s_state != CEPH_MDS_SESSION_HUNG) {
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+ list_add(&req->r_wait, &session->s_waiting);
+ goto out_session;
+ }
+
+ /* send request */
+ req->r_session = get_session(session);
+ req->r_resend_mds = -1; /* forget any previous mds hint */
+
+ if (req->r_request_started == 0) /* note request start time */
+ req->r_request_started = jiffies;
+
+ err = __prepare_send_request(mdsc, req, mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+out_session:
+ ceph_put_mds_session(session);
+out:
+ return err;
+
+finish:
+ req->r_reply = ERR_PTR(err);
+ complete_request(mdsc, req);
+ goto out;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head)
+{
+ struct ceph_mds_request *req, *nreq;
+
+ list_for_each_entry_safe(req, nreq, head, r_wait) {
+ list_del_init(&req->r_wait);
+ __do_request(mdsc, req);
+ }
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds. If @all is set,
+ * wake up if their requests has been forwarded to @mds, too.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+{
+ struct ceph_mds_request *reqs[10];
+ u64 nexttid = 0;
+ int i, got;
+
+ dout("kick_requests mds%d\n", mds);
+ while (nexttid <= mdsc->last_tid) {
+ got = radix_tree_gang_lookup(&mdsc->request_tree,
+ (void **)&reqs, nexttid, 10);
+ if (got == 0)
+ break;
+ nexttid = reqs[got-1]->r_tid + 1;
+ for (i = 0; i < got; i++) {
+ if (reqs[i]->r_got_unsafe)
+ continue;
+ if (reqs[i]->r_session &&
+ reqs[i]->r_session->s_mds == mds) {
+ dout(" kicking tid %llu\n", reqs[i]->r_tid);
+ put_request_session(reqs[i]);
+ __do_request(mdsc, reqs[i]);
+ }
+ }
+ }
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("submit_request on %p\n", req);
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, NULL);
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchrously perform an mds request. Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req)
+{
+ int err;
+
+ dout("do_request on %p\n", req);
+
+ /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+ if (req->r_inode)
+ ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+ if (req->r_locked_dir)
+ ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+ if (req->r_old_dentry)
+ ceph_get_cap_refs(
+ ceph_inode(req->r_old_dentry->d_parent->d_inode),
+ CEPH_CAP_PIN);
+
+ /* issue */
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, dir);
+ __do_request(mdsc, req);
+
+ /* wait */
+ if (!req->r_reply) {
+ mutex_unlock(&mdsc->mutex);
+ if (req->r_timeout) {
+ err = wait_for_completion_timeout(&req->r_completion,
+ req->r_timeout);
+ if (err > 0)
+ err = 0;
+ else if (err == 0)
+ req->r_reply = ERR_PTR(-EIO);
+ } else {
+ wait_for_completion(&req->r_completion);
+ }
+ mutex_lock(&mdsc->mutex);
+ }
+
+ if (IS_ERR(req->r_reply)) {
+ err = PTR_ERR(req->r_reply);
+ req->r_reply = NULL;
+
+ /* clean up */
+ __unregister_request(mdsc, req);
+ if (!list_empty(&req->r_unsafe_item))
+ list_del_init(&req->r_unsafe_item);
+ complete(&req->r_safe_completion);
+ } else if (req->r_err) {
+ err = req->r_err;
+ } else {
+ err = le32_to_cpu(req->r_reply_info.head->result);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ dout("do_request %p done, result %d\n", req, err);
+ return err;
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_mds_reply_head *head = msg->front.iov_base;
+ struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ u64 tid;
+ int err, result;
+ int mds;
+
+ if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+ return;
+ if (msg->front.iov_len < sizeof(*head)) {
+ pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ return;
+ }
+
+ /* get request, session */
+ tid = le64_to_cpu(head->tid);
+ mutex_lock(&mdsc->mutex);
+ req = __lookup_request(mdsc, tid);
+ if (!req) {
+ dout("handle_reply on unknown tid %llu\n", tid);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+ dout("handle_reply %p\n", req);
+ mds = le64_to_cpu(msg->hdr.src.name.num);
+
+ /* correct session? */
+ if (!req->r_session && req->r_session != session) {
+ pr_err("mdsc_handle_reply got %llu on session mds%d"
+ " not mds%d\n", tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ /* dup? */
+ if ((req->r_got_unsafe && !head->safe) ||
+ (req->r_got_safe && head->safe)) {
+ pr_warning("got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ result = le32_to_cpu(head->result);
+
+ /*
+ * Tolerate 2 consecutive ESTALEs from the same mds.
+ * FIXME: we should be looking at the cap migrate_seq.
+ */
+ if (result == -ESTALE) {
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_num_stale++;
+ if (req->r_num_stale <= 2) {
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ } else {
+ req->r_num_stale = 0;
+ }
+
+ if (head->safe) {
+ req->r_got_safe = true;
+ __unregister_request(mdsc, req);
+ complete(&req->r_safe_completion);
+
+ if (req->r_got_unsafe) {
+ /*
+ * We already handled the unsafe response, now do the
+ * cleanup. No need to examine the response; the MDS
+ * doesn't include any result info in the safe
+ * response. And even if it did, there is nothing
+ * useful we could do with a revised return value.
+ */
+ dout("got safe reply %llu, mds%d\n", tid, mds);
+ list_del_init(&req->r_unsafe_item);
+
+ /* last unsafe request during umount? */
+ if (mdsc->stopping && !__get_oldest_tid(mdsc))
+ complete(&mdsc->safe_umount_waiters);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ }
+
+ BUG_ON(req->r_reply);
+
+ if (!head->safe) {
+ req->r_got_unsafe = true;
+ list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ }
+
+ dout("handle_reply tid %lld result %d\n", tid, result);
+ rinfo = &req->r_reply_info;
+ err = parse_reply_info(msg, rinfo);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (err < 0) {
+ pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+ goto out_err;
+ }
+
+ /* snap trace */
+ if (rinfo->snapblob_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, rinfo->snapblob,
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
+
+ /* insert trace into our cache */
+ err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+ if (err == 0) {
+ if (result == 0 && rinfo->dir_nr)
+ ceph_readdir_prepopulate(req, req->r_session);
+ ceph_unreserve_caps(&req->r_caps_reservation);
+ }
+
+ up_read(&mdsc->snap_rwsem);
+out_err:
+ if (err) {
+ req->r_err = err;
+ } else {
+ req->r_reply = msg;
+ ceph_msg_get(msg);
+ }
+
+ add_cap_releases(mdsc, req->r_session, -1);
+ mutex_unlock(&session->s_mutex);
+
+ /* kick calling process */
+ complete_request(mdsc, req);
+out:
+ ceph_mdsc_put_request(req);
+ return;
+}
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_mds_request *req;
+ u64 tid;
+ u32 next_mds;
+ u32 fwd_seq;
+ u8 must_resend;
+ int err = -EINVAL;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ int from_mds, state;
+
+ if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+ goto bad;
+ from_mds = le64_to_cpu(msg->hdr.src.name.num);
+
+ ceph_decode_need(&p, end, sizeof(u64)+2*sizeof(u32), bad);
+ tid = ceph_decode_64(&p);
+ next_mds = ceph_decode_32(&p);
+ fwd_seq = ceph_decode_32(&p);
+ must_resend = ceph_decode_8(&p);
+
+ WARN_ON(must_resend); /* shouldn't happen. */
+
+ mutex_lock(&mdsc->mutex);
+ req = __lookup_request(mdsc, tid);
+ if (!req) {
+ dout("forward %llu dne\n", tid);
+ goto out; /* dup reply? */
+ }
+
+ state = mdsc->sessions[next_mds]->s_state;
+ if (fwd_seq <= req->r_num_fwd) {
+ dout("forward %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ } else {
+ /* resend. forward race not possible; mds would drop */
+ dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
+ req->r_num_fwd = fwd_seq;
+ req->r_resend_mds = next_mds;
+ put_request_session(req);
+ __do_request(mdsc, req);
+ }
+ ceph_mdsc_put_request(req);
+out:
+ mutex_unlock(&mdsc->mutex);
+ return;
+
+bad:
+ pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ u32 op;
+ u64 seq;
+ int mds;
+ struct ceph_mds_session_head *h = msg->front.iov_base;
+ int wake = 0;
+
+ if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+ return;
+ mds = le64_to_cpu(msg->hdr.src.name.num);
+
+ /* decode */
+ if (msg->front.iov_len != sizeof(*h))
+ goto bad;
+ op = le32_to_cpu(h->op);
+ seq = le64_to_cpu(h->seq);
+
+ mutex_lock(&mdsc->mutex);
+ /* FIXME: this ttl calculation is generous */
+ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+
+ dout("handle_session mds%d %s %p state %s seq %llu\n",
+ mds, ceph_session_op_name(op), session,
+ session_state_name(session->s_state), seq);
+
+ if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ pr_info("mds%d came back\n", session->s_mds);
+ }
+
+ switch (op) {
+ case CEPH_SESSION_OPEN:
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ renewed_caps(mdsc, session, 0);
+ wake = 1;
+ if (mdsc->stopping)
+ __close_session(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RENEWCAPS:
+ if (session->s_renew_seq == seq)
+ renewed_caps(mdsc, session, 1);
+ break;
+
+ case CEPH_SESSION_CLOSE:
+ unregister_session(mdsc, session);
+ remove_session_caps(session);
+ wake = 1; /* for good measure */
+ complete(&mdsc->session_close_waiters);
+ kick_requests(mdsc, mds, 0); /* cur only */
+ break;
+
+ case CEPH_SESSION_STALE:
+ pr_info("mds%d caps went stale, renewing\n",
+ session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_gen++;
+ session->s_cap_ttl = 0;
+ spin_unlock(&session->s_cap_lock);
+ send_renew_caps(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RECALL_STATE:
+ trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+ break;
+
+ default:
+ pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ WARN_ON(1);
+ }
+
+ mutex_unlock(&session->s_mutex);
+ if (wake) {
+ mutex_lock(&mdsc->mutex);
+ __wake_requests(mdsc, &session->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+ }
+ return;
+
+bad:
+ pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
+ return;
+}
+
+
+/*
+ * called under session->mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_request *req, *nreq;
+ int err;
+
+ dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+ mutex_lock(&mdsc->mutex);
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+ err = __prepare_send_request(mdsc, req, session->s_mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+struct encode_caps_data {
+ void **pp;
+ void *end;
+ int *num_caps;
+};
+
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_mds_cap_reconnect *rec;
+ struct ceph_inode_info *ci;
+ struct encode_caps_data *data = (struct encode_caps_data *)arg;
+ void *p = *(data->pp);
+ void *end = data->end;
+ char *path;
+ int pathlen, err;
+ u64 pathbase;
+ struct dentry *dentry;
+
+ ci = cap->ci;
+
+ dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+ inode, ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
+ ceph_decode_need(&p, end, sizeof(u64), needmore);
+ ceph_encode_64(&p, ceph_ino(inode));
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ BUG_ON(err);
+ }
+ } else {
+ path = NULL;
+ pathlen = 0;
+ }
+ ceph_decode_need(&p, end, pathlen+4, needmore);
+ ceph_encode_string(&p, end, path, pathlen);
+
+ ceph_decode_need(&p, end, sizeof(*rec), needmore);
+ rec = p;
+ p += sizeof(*rec);
+ BUG_ON(p > end);
+ spin_lock(&inode->i_lock);
+ cap->seq = 0; /* reset cap seq */
+ cap->issue_seq = 0; /* and issue_seq */
+ rec->cap_id = cpu_to_le64(cap->cap_id);
+ rec->pathbase = cpu_to_le64(pathbase);
+ rec->wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+ rec->issued = cpu_to_le32(cap->issued);
+ rec->size = cpu_to_le64(inode->i_size);
+ ceph_encode_timespec(&rec->mtime, &inode->i_mtime);
+ ceph_encode_timespec(&rec->atime, &inode->i_atime);
+ rec->snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+ spin_unlock(&inode->i_lock);
+
+ kfree(path);
+ dput(dentry);
+ (*data->num_caps)++;
+ *(data->pp) = p;
+ return 0;
+needmore:
+ return -ENOSPC;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state. This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy. Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about.. that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+{
+ struct ceph_mds_session *session;
+ struct ceph_msg *reply;
+ int newlen, len = 4 + 1;
+ void *p, *end;
+ int err;
+ int num_caps, num_realms = 0;
+ int got;
+ u64 next_snap_ino = 0;
+ __le32 *pnum_caps, *pnum_realms;
+ struct encode_caps_data iter_args;
+
+ pr_info("reconnect to recovering mds%d\n", mds);
+
+ /* find session */
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex); /* drop lock for duration */
+
+ if (session) {
+ mutex_lock(&session->s_mutex);
+
+ session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+ session->s_seq = 0;
+
+ ceph_con_open(&session->s_con,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ /* replay unsafe requests */
+ replay_unsafe_requests(mdsc, session);
+
+ /* estimate needed space */
+ len += session->s_nr_caps *
+ (100+sizeof(struct ceph_mds_cap_reconnect));
+ pr_info("estimating i need %d bytes for %d caps\n",
+ len, session->s_nr_caps);
+ } else {
+ dout("no session for mds%d, will send short reconnect\n",
+ mds);
+ }
+
+ down_read(&mdsc->snap_rwsem);
+
+retry:
+ /* build reply */
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, len, 0, 0, NULL);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ pr_err("send_mds_reconnect ENOMEM on %d for mds%d\n",
+ len, mds);
+ goto out;
+ }
+ p = reply->front.iov_base;
+ end = p + len;
+
+ if (!session) {
+ ceph_encode_8(&p, 1); /* session was closed */
+ ceph_encode_32(&p, 0);
+ goto send;
+ }
+ dout("session %p state %s\n", session,
+ session_state_name(session->s_state));
+
+ /* traverse this session's caps */
+ ceph_encode_8(&p, 0);
+ pnum_caps = p;
+ ceph_encode_32(&p, session->s_nr_caps);
+ num_caps = 0;
+
+ iter_args.pp = &p;
+ iter_args.end = end;
+ iter_args.num_caps = &num_caps;
+ err = iterate_session_caps(session, encode_caps_cb, &iter_args);
+ if (err == -ENOSPC)
+ goto needmore;
+ if (err < 0)
+ goto out;
+ *pnum_caps = cpu_to_le32(num_caps);
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ next_snap_ino = 0;
+ /* save some space for the snaprealm count */
+ pnum_realms = p;
+ ceph_decode_need(&p, end, sizeof(*pnum_realms), needmore);
+ p += sizeof(*pnum_realms);
+ num_realms = 0;
+ while (1) {
+ struct ceph_snap_realm *realm;
+ struct ceph_mds_snaprealm_reconnect *sr_rec;
+ got = radix_tree_gang_lookup(&mdsc->snap_realms,
+ (void **)&realm, next_snap_ino, 1);
+ if (!got)
+ break;
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ ceph_decode_need(&p, end, sizeof(*sr_rec), needmore);
+ sr_rec = p;
+ sr_rec->ino = cpu_to_le64(realm->ino);
+ sr_rec->seq = cpu_to_le64(realm->seq);
+ sr_rec->parent = cpu_to_le64(realm->parent_ino);
+ p += sizeof(*sr_rec);
+ num_realms++;
+ next_snap_ino = realm->ino + 1;
+ }
+ *pnum_realms = cpu_to_le32(num_realms);
+
+send:
+ reply->front.iov_len = p - reply->front.iov_base;
+ reply->hdr.front_len = cpu_to_le32(reply->front.iov_len);
+ dout("final len was %u (guessed %d)\n",
+ (unsigned)reply->front.iov_len, len);
+ ceph_con_send(&session->s_con, reply);
+
+ if (session) {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ __wake_requests(mdsc, &session->s_waiting);
+ }
+
+out:
+ up_read(&mdsc->snap_rwsem);
+ if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ mutex_lock(&mdsc->mutex);
+ return;
+
+needmore:
+ /*
+ * we need a larger buffer. this doesn't very accurately
+ * factor in snap realms, but it's safe.
+ */
+ num_caps += num_realms;
+ newlen = len * ((100 * (session->s_nr_caps+3)) / (num_caps + 1)) / 100;
+ pr_info("i guessed %d, and did %d of %d caps, retrying with %d\n",
+ len, num_caps, session->s_nr_caps, newlen);
+ len = newlen;
+ ceph_msg_put(reply);
+ goto retry;
+}
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+ struct ceph_mdsmap *newmap,
+ struct ceph_mdsmap *oldmap)
+{
+ int i;
+ int oldstate, newstate;
+ struct ceph_mds_session *s;
+
+ dout("check_new_map new %u old %u\n",
+ newmap->m_epoch, oldmap->m_epoch);
+
+ for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ if (mdsc->sessions[i] == NULL)
+ continue;
+ s = mdsc->sessions[i];
+ oldstate = ceph_mdsmap_get_state(oldmap, i);
+ newstate = ceph_mdsmap_get_state(newmap, i);
+
+ dout("check_new_map mds%d state %s -> %s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mds_state_name(newstate),
+ session_state_name(s->s_state));
+
+ if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ ceph_mdsmap_get_addr(newmap, i),
+ sizeof(struct ceph_entity_addr))) {
+ if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+ /* the session never opened, just close it
+ * out now */
+ __wake_requests(mdsc, &s->s_waiting);
+ unregister_session(mdsc, s);
+ } else {
+ /* just close it */
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
+ ceph_con_close(&s->s_con);
+ mutex_unlock(&s->s_mutex);
+ s->s_state = CEPH_MDS_SESSION_RESTARTING;
+ }
+
+ /* kick any requests waiting on the recovering mds */
+ kick_requests(mdsc, i, 1);
+ } else if (oldstate == newstate) {
+ continue; /* nothing new with this mds */
+ }
+
+ /*
+ * send reconnect?
+ */
+ if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+ newstate >= CEPH_MDS_STATE_RECONNECT)
+ send_mds_reconnect(mdsc, i);
+
+ /*
+ * kick requests on any mds that has gone active.
+ *
+ * kick requests on cur or forwarder: we may have sent
+ * the request to mds1, mds1 told us it forwarded it
+ * to mds2, but then we learn mds1 failed and can't be
+ * sure it successfully forwarded our request before
+ * it died.
+ */
+ if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+ newstate >= CEPH_MDS_STATE_ACTIVE) {
+ pr_info("mds%d reconnect completed\n", s->s_mds);
+ kick_requests(mdsc, i, 1);
+ ceph_kick_flushing_caps(mdsc, s);
+ wake_up_session_caps(s, 1);
+ }
+ }
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ ceph_put_mds_session(di->lease_session);
+ di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->client->sb;
+ struct inode *inode;
+ struct ceph_mds_session *session;
+ struct ceph_inode_info *ci;
+ struct dentry *parent, *dentry;
+ struct ceph_dentry_info *di;
+ int mds;
+ struct ceph_mds_lease *h = msg->front.iov_base;
+ struct ceph_vino vino;
+ int mask;
+ struct qstr dname;
+ int release = 0;
+
+ if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+ return;
+ mds = le64_to_cpu(msg->hdr.src.name.num);
+ dout("handle_lease from mds%d\n", mds);
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+ goto bad;
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ mask = le16_to_cpu(h->mask);
+ dname.name = (void *)h + sizeof(*h) + sizeof(u32);
+ dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
+ if (dname.len != get_unaligned_le32(h+1))
+ goto bad;
+
+ /* find session */
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (!session) {
+ pr_err("handle_lease got lease but no session mds%d\n", mds);
+ return;
+ }
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+
+ /* lookup inode */
+ inode = ceph_find_inode(sb, vino);
+ dout("handle_lease '%s', mask %d, ino %llx %p\n",
+ ceph_lease_op_name(h->action), mask, vino.ino, inode);
+ if (inode == NULL) {
+ dout("handle_lease no inode %llx\n", vino.ino);
+ goto release;
+ }
+ ci = ceph_inode(inode);
+
+ /* dentry */
+ parent = d_find_alias(inode);
+ if (!parent) {
+ dout("no parent dentry on inode %p\n", inode);
+ WARN_ON(1);
+ goto release; /* hrm... */
+ }
+ dname.hash = full_name_hash(dname.name, dname.len);
+ dentry = d_lookup(parent, &dname);
+ dput(parent);
+ if (!dentry)
+ goto release;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ switch (h->action) {
+ case CEPH_MDS_LEASE_REVOKE:
+ if (di && di->lease_session == session) {
+ h->seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ }
+ release = 1;
+ break;
+
+ case CEPH_MDS_LEASE_RENEW:
+ if (di && di->lease_session == session &&
+ di->lease_gen == session->s_cap_gen &&
+ di->lease_renew_from &&
+ di->lease_renew_after == 0) {
+ unsigned long duration =
+ le32_to_cpu(h->duration_ms) * HZ / 1000;
+
+ di->lease_seq = le32_to_cpu(h->seq);
+ dentry->d_time = di->lease_renew_from + duration;
+ di->lease_renew_after = di->lease_renew_from +
+ (duration >> 1);
+ di->lease_renew_from = 0;
+ }
+ break;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+
+ if (!release)
+ goto out;
+
+release:
+ /* let's just reuse the same message */
+ h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+ ceph_msg_get(msg);
+ ceph_con_send(&session->s_con, msg);
+
+out:
+ iput(inode);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ return;
+
+bad:
+ pr_err("corrupt lease message\n");
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_lease *lease;
+ int len = sizeof(*lease) + sizeof(u32);
+ int dnamelen = 0;
+
+ dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+ inode, dentry, ceph_lease_op_name(action), session->s_mds);
+ dnamelen = dentry->d_name.len;
+ len += dnamelen;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+ if (IS_ERR(msg))
+ return;
+ lease = msg->front.iov_base;
+ lease->action = action;
+ lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+ lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+ lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+ lease->seq = cpu_to_le32(seq);
+ put_unaligned_le32(dnamelen, lease + 1);
+ memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+ /*
+ * if this is a preemptive lease RELEASE, no need to
+ * flush request stream, since the actual request will
+ * soon follow.
+ */
+ msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+ ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * Preemptively release a lease we expect to invalidate anyway.
+ * Pass @inode always, @dentry is optional.
+ */
+void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
+ struct dentry *dentry, int mask)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *session;
+ u32 seq;
+
+ BUG_ON(inode == NULL);
+ BUG_ON(dentry == NULL);
+ BUG_ON(mask != CEPH_LOCK_DN);
+
+ /* is dentry lease valid? */
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (!di || !di->lease_session ||
+ di->lease_session->s_mds < 0 ||
+ di->lease_gen != di->lease_session->s_cap_gen ||
+ !time_before(jiffies, dentry->d_time)) {
+ dout("lease_release inode %p dentry %p -- "
+ "no lease on %d\n",
+ inode, dentry, mask);
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+
+ /* we do have a lease on this dentry; note mds and seq */
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ spin_unlock(&dentry->d_lock);
+
+ dout("lease_release inode %p dentry %p mask %d to mds%d\n",
+ inode, dentry, mask, session->s_mds);
+ ceph_mdsc_lease_send_msg(session, inode, dentry,
+ CEPH_MDS_LEASE_RELEASE, seq);
+ ceph_put_mds_session(session);
+}
+
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+ int i;
+
+ dout("drop_leases\n");
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+ int delay = 5;
+ unsigned hz = round_jiffies_relative(HZ * delay);
+ schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+ int i;
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, delayed_work.work);
+ int renew_interval;
+ int renew_caps;
+
+ dout("mdsc delayed_work\n");
+ ceph_check_delayed_caps(mdsc);
+
+ mutex_lock(&mdsc->mutex);
+ renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+ renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+ mdsc->last_renew_caps);
+ if (renew_caps)
+ mdsc->last_renew_caps = jiffies;
+
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (s == NULL)
+ continue;
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout("resending session close request for mds%d\n",
+ s->s_mds);
+ request_close_session(mdsc, s);
+ ceph_put_mds_session(s);
+ continue;
+ }
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ }
+ if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+ /* this mds is failed or recovering, just wait */
+ ceph_put_mds_session(s);
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ if (renew_caps)
+ send_renew_caps(mdsc, s);
+ else
+ ceph_con_keepalive(&s->s_con);
+ add_cap_releases(mdsc, s, -1);
+ send_cap_releases(mdsc, s);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ schedule_delayed(mdsc);
+}
+
+
+int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+{
+ mdsc->client = client;
+ mutex_init(&mdsc->mutex);
+ mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+ init_completion(&mdsc->safe_umount_waiters);
+ init_completion(&mdsc->session_close_waiters);
+ INIT_LIST_HEAD(&mdsc->waiting_for_map);
+ mdsc->sessions = NULL;
+ mdsc->max_sessions = 0;
+ mdsc->stopping = 0;
+ init_rwsem(&mdsc->snap_rwsem);
+ INIT_RADIX_TREE(&mdsc->snap_realms, GFP_NOFS);
+ INIT_LIST_HEAD(&mdsc->snap_empty);
+ spin_lock_init(&mdsc->snap_empty_lock);
+ mdsc->last_tid = 0;
+ INIT_RADIX_TREE(&mdsc->request_tree, GFP_NOFS);
+ INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+ mdsc->last_renew_caps = jiffies;
+ INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->snap_flush_list);
+ spin_lock_init(&mdsc->snap_flush_lock);
+ mdsc->cap_flush_seq = 0;
+ INIT_LIST_HEAD(&mdsc->cap_dirty);
+ mdsc->num_cap_flushing = 0;
+ spin_lock_init(&mdsc->cap_dirty_lock);
+ init_waitqueue_head(&mdsc->cap_flushing_wq);
+ spin_lock_init(&mdsc->dentry_lru_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_lru);
+ return 0;
+}
+
+/*
+ * Wait for safe replies on open mds requests. If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_request *req;
+ struct ceph_client *client = mdsc->client;
+
+ mutex_lock(&mdsc->mutex);
+ if (__get_oldest_tid(mdsc)) {
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_requests waiting for requests\n");
+ wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+ client->mount_args->mount_timeout * HZ);
+ mutex_lock(&mdsc->mutex);
+
+ /* tear down remaining requests */
+ while (radix_tree_gang_lookup(&mdsc->request_tree,
+ (void **)&req, 0, 1)) {
+ dout("wait_requests timed out on tid %llu\n",
+ req->r_tid);
+ radix_tree_delete(&mdsc->request_tree, req->r_tid);
+ ceph_mdsc_put_request(req);
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+ dout("pre_umount\n");
+ mdsc->stopping = 1;
+
+ drop_leases(mdsc);
+ ceph_flush_dirty_caps(mdsc);
+ wait_requests(mdsc);
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+ struct ceph_mds_request *req;
+ u64 next_tid = 0;
+ int got;
+
+ mutex_lock(&mdsc->mutex);
+ dout("wait_unsafe_requests want %lld\n", want_tid);
+ while (1) {
+ got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
+ next_tid, 1);
+ if (!got)
+ break;
+ if (req->r_tid > want_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
+ continue; /* not a write op */
+
+ ceph_mdsc_get_request(req);
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&mdsc->mutex);
+ ceph_mdsc_put_request(req);
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+ u64 want_tid, want_flush;
+
+ dout("sync\n");
+ mutex_lock(&mdsc->mutex);
+ want_tid = mdsc->last_tid;
+ want_flush = mdsc->cap_flush_seq;
+ mutex_unlock(&mdsc->mutex);
+ dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
+ ceph_flush_dirty_caps(mdsc);
+
+ wait_unsafe_requests(mdsc, want_tid);
+ wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+}
+
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int i;
+ int n;
+ struct ceph_client *client = mdsc->client;
+ unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+
+ dout("close_sessions\n");
+
+ mutex_lock(&mdsc->mutex);
+
+ /* close sessions */
+ started = jiffies;
+ while (time_before(jiffies, started + timeout)) {
+ dout("closing sessions\n");
+ n = 0;
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ session = __ceph_lookup_mds_session(mdsc, i);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ n++;
+ }
+ if (n == 0)
+ break;
+
+ if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ break;
+
+ dout("waiting for sessions to close\n");
+ mutex_unlock(&mdsc->mutex);
+ wait_for_completion_timeout(&mdsc->session_close_waiters,
+ timeout);
+ mutex_lock(&mdsc->mutex);
+ }
+
+ /* tear down remaining sessions */
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ if (mdsc->sessions[i]) {
+ session = get_session(mdsc->sessions[i]);
+ unregister_session(mdsc, session);
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ remove_session_caps(session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ }
+
+ WARN_ON(!list_empty(&mdsc->cap_delay_list));
+
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_cleanup_empty_realms(mdsc);
+
+ cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+ dout("stopped\n");
+}
+
+void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ if (mdsc->mdsmap)
+ ceph_mdsmap_destroy(mdsc->mdsmap);
+ kfree(mdsc->sessions);
+}
+
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ u32 epoch;
+ u32 maplen;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mdsmap *newmap, *oldmap;
+ struct ceph_fsid fsid;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+ return;
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+ /* do we need it? */
+ ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+ mutex_lock(&mdsc->mutex);
+ if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+ dout("handle_map epoch %u <= our %u\n",
+ epoch, mdsc->mdsmap->m_epoch);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+
+ newmap = ceph_mdsmap_decode(&p, end);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad_unlock;
+ }
+
+ /* swap into place */
+ if (mdsc->mdsmap) {
+ oldmap = mdsc->mdsmap;
+ mdsc->mdsmap = newmap;
+ check_new_map(mdsc, newmap, oldmap);
+ ceph_mdsmap_destroy(oldmap);
+ } else {
+ mdsc->mdsmap = newmap; /* first mds map */
+ }
+ mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+
+ mutex_unlock(&mdsc->mutex);
+ schedule_delayed(mdsc);
+ return;
+
+bad_unlock:
+ mutex_unlock(&mdsc->mutex);
+bad:
+ pr_err("error decoding mdsmap %d\n", err);
+ return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ if (get_session(s)) {
+ dout("mdsc con_get %p %d -> %d\n", s,
+ atomic_read(&s->s_ref) - 1, atomic_read(&s->s_ref));
+ return con;
+ }
+ dout("mdsc con_get %p FAIL\n", s);
+ return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ dout("mdsc con_put %p %d -> %d\n", s, atomic_read(&s->s_ref),
+ atomic_read(&s->s_ref) - 1);
+ ceph_put_mds_session(s);
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
+ s->s_mds);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(mdsc, msg);
+ break;
+ case CEPH_MSG_CLIENT_SESSION:
+ handle_session(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REPLY:
+ handle_reply(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+ handle_forward(mdsc, msg);
+ break;
+ case CEPH_MSG_CLIENT_CAPS:
+ ceph_handle_caps(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_SNAP:
+ ceph_handle_snap(mdsc, msg);
+ break;
+ case CEPH_MSG_CLIENT_LEASE:
+ handle_lease(mdsc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ int ret = 0;
+
+ if (force_new && s->s_authorizer) {
+ ac->ops->destroy_authorizer(ac, s->s_authorizer);
+ s->s_authorizer = NULL;
+ }
+ if (s->s_authorizer == NULL) {
+ if (ac->ops->create_authorizer) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_MDS,
+ &s->s_authorizer,
+ &s->s_authorizer_buf,
+ &s->s_authorizer_buf_len,
+ &s->s_authorizer_reply_buf,
+ &s->s_authorizer_reply_buf_len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *proto = ac->protocol;
+ *buf = s->s_authorizer_buf;
+ *len = s->s_authorizer_buf_len;
+ *reply_buf = s->s_authorizer_reply_buf;
+ *reply_len = s->s_authorizer_reply_buf_len;
+ return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+}
+
+const static struct ceph_connection_operations mds_con_ops = {
+ .get = con_get,
+ .put = con_put,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .peer_reset = peer_reset,
+ .alloc_msg = ceph_alloc_msg,
+ .alloc_middle = ceph_alloc_middle,
+};
+
+
+
+
+/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..9faa1b2f79a7
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,327 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/spinlock.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "mdsmap.h"
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ * mdsc->mutex
+ *
+ * mdsc->snap_rwsem
+ *
+ * inode->i_lock
+ * mdsc->snap_flush_lock
+ * mdsc->cap_delay_lock
+ *
+ */
+
+struct ceph_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode. pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+ struct ceph_mds_reply_inode *in;
+ u32 symlink_len;
+ char *symlink;
+ u32 xattr_len;
+ char *xattr_data;
+};
+
+/*
+ * parsed info about an mds reply, including information about the
+ * target inode and/or its parent directory and dentry, and directory
+ * contents (for readdir results).
+ */
+struct ceph_mds_reply_info_parsed {
+ struct ceph_mds_reply_head *head;
+
+ struct ceph_mds_reply_info_in diri, targeti;
+ struct ceph_mds_reply_dirfrag *dirfrag;
+ char *dname;
+ u32 dname_len;
+ struct ceph_mds_reply_lease *dlease;
+
+ struct ceph_mds_reply_dirfrag *dir_dir;
+ int dir_nr;
+ char **dir_dname;
+ u32 *dir_dname_len;
+ struct ceph_mds_reply_lease **dir_dlease;
+ struct ceph_mds_reply_info_in *dir_in;
+ u8 dir_complete, dir_end;
+
+ /* encoded blob describing snapshot contexts for certain
+ operations (e.g., open) */
+ void *snapblob;
+ int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
+ sizeof(struct ceph_mds_cap_release)) / \
+ sizeof(struct ceph_mds_cap_item))
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+ CEPH_MDS_SESSION_NEW = 1,
+ CEPH_MDS_SESSION_OPENING = 2,
+ CEPH_MDS_SESSION_OPEN = 3,
+ CEPH_MDS_SESSION_HUNG = 4,
+ CEPH_MDS_SESSION_CLOSING = 5,
+ CEPH_MDS_SESSION_RESTARTING = 6,
+ CEPH_MDS_SESSION_RECONNECTING = 7,
+};
+
+struct ceph_mds_session {
+ struct ceph_mds_client *s_mdsc;
+ int s_mds;
+ int s_state;
+ unsigned long s_ttl; /* time until mds kills us */
+ u64 s_seq; /* incoming msg seq # */
+ struct mutex s_mutex; /* serialize session messages */
+
+ struct ceph_connection s_con;
+
+ struct ceph_authorizer *s_authorizer;
+ void *s_authorizer_buf, *s_authorizer_reply_buf;
+ size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
+ u32 s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire */
+ struct list_head s_caps; /* all caps issued by this session */
+ int s_nr_caps, s_trim_caps;
+ int s_num_cap_releases;
+ struct list_head s_cap_releases; /* waiting cap_release messages */
+ struct list_head s_cap_releases_done; /* ready to send */
+
+ /* protected by mutex */
+ struct list_head s_cap_flushing; /* inodes w/ flushing caps */
+ struct list_head s_cap_snaps_flushing;
+ unsigned long s_renew_requested; /* last time we sent a renew req */
+ u64 s_renew_seq;
+
+ atomic_t s_ref;
+ struct list_head s_waiting; /* waiting requests */
+ struct list_head s_unsafe; /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+ USE_ANY_MDS,
+ USE_RANDOM_MDS,
+ USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+ u64 r_tid; /* transaction id */
+
+ int r_op; /* mds op code */
+ int r_mds;
+
+ /* operation on what? */
+ struct inode *r_inode; /* arg1 */
+ struct dentry *r_dentry; /* arg1 */
+ struct dentry *r_old_dentry; /* arg2: rename from or link from */
+ char *r_path1, *r_path2;
+ struct ceph_vino r_ino1, r_ino2;
+
+ struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+ struct inode *r_target_inode; /* resulting inode */
+
+ union ceph_mds_request_args r_args;
+ int r_fmode; /* file mode, if expecting cap */
+
+ /* for choosing which mds to send this request to */
+ int r_direct_mode;
+ u32 r_direct_hash; /* choose dir frag based on this dentry hash */
+ bool r_direct_is_hash; /* true if r_direct_hash is valid */
+
+ /* data payload is used for xattr ops */
+ struct page **r_pages;
+ int r_num_pages;
+ int r_data_len;
+
+ /* what caps shall we drop? */
+ int r_inode_drop, r_inode_unless;
+ int r_dentry_drop, r_dentry_unless;
+ int r_old_dentry_drop, r_old_dentry_unless;
+ struct inode *r_old_inode;
+ int r_old_inode_drop, r_old_inode_unless;
+
+ struct ceph_msg *r_request; /* original request */
+ struct ceph_msg *r_reply;
+ struct ceph_mds_reply_info_parsed r_reply_info;
+ int r_err;
+
+ unsigned long r_timeout; /* optional. jiffies */
+ unsigned long r_started; /* start time to measure timeout against */
+ unsigned long r_request_started; /* start time for mds request only,
+ used to measure lease durations */
+
+ /* link unsafe requests to parent directory, for fsync */
+ struct inode *r_unsafe_dir;
+ struct list_head r_unsafe_dir_item;
+
+ struct ceph_mds_session *r_session;
+
+ int r_attempts; /* resend attempts */
+ int r_num_fwd; /* number of forward attempts */
+ int r_num_stale;
+ int r_resend_mds; /* mds to resend to next, if any*/
+
+ atomic_t r_ref;
+ struct list_head r_wait;
+ struct completion r_completion;
+ struct completion r_safe_completion;
+ ceph_mds_request_callback_t r_callback;
+ struct list_head r_unsafe_item; /* per-session unsafe list item */
+ bool r_got_unsafe, r_got_safe;
+
+ bool r_did_prepopulate;
+ u32 r_readdir_offset;
+
+ struct ceph_cap_reservation r_caps_reservation;
+ int r_num_caps;
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+ struct ceph_client *client;
+ struct mutex mutex; /* all nested structures */
+
+ struct ceph_mdsmap *mdsmap;
+ struct completion safe_umount_waiters, session_close_waiters;
+ struct list_head waiting_for_map;
+
+ struct ceph_mds_session **sessions; /* NULL for mds if no session */
+ int max_sessions; /* len of s_mds_sessions */
+ int stopping; /* true if shutting down */
+
+ /*
+ * snap_rwsem will cover cap linkage into snaprealms, and
+ * realm snap contexts. (later, we can do per-realm snap
+ * contexts locks..) the empty list contains realms with no
+ * references (implying they contain no inodes with caps) that
+ * should be destroyed.
+ */
+ struct rw_semaphore snap_rwsem;
+ struct radix_tree_root snap_realms;
+ struct list_head snap_empty;
+ spinlock_t snap_empty_lock; /* protect snap_empty */
+
+ u64 last_tid; /* most recent mds request */
+ struct radix_tree_root request_tree; /* pending mds requests */
+ struct delayed_work delayed_work; /* delayed work */
+ unsigned long last_renew_caps; /* last time we renewed our caps */
+ struct list_head cap_delay_list; /* caps with delayed release */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head snap_flush_list; /* cap_snaps ready to flush */
+ spinlock_t snap_flush_lock;
+
+ u64 cap_flush_seq;
+ struct list_head cap_dirty; /* inodes with dirty caps */
+ int num_cap_flushing; /* # caps we are flushing */
+ spinlock_t cap_dirty_lock; /* protects above items */
+ wait_queue_head_t cap_flushing_wq;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ spinlock_t dentry_lru_lock;
+ struct list_head dentry_lru;
+ int num_dentry;
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+ atomic_inc(&s->s_ref);
+ return s;
+}
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg, int mds);
+
+extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
+ struct ceph_client *client);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
+ struct inode *inode,
+ struct dentry *dn, int mask);
+
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+ atomic_inc(&req->r_ref);
+}
+extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq);
+
+extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..cad8d25861e5
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,170 @@
+#include "ceph_debug.h"
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "mdsmap.h"
+#include "messenger.h"
+#include "decode.h"
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int n = 0;
+ int i;
+ char r;
+
+ /* count */
+ for (i = 0; i < m->m_max_mds; i++)
+ if (m->m_info[i].state > 0)
+ n++;
+ if (n == 0)
+ return -1;
+
+ /* pick */
+ get_random_bytes(&r, 1);
+ n = r % n;
+ i = 0;
+ for (i = 0; n > 0; i++, n--)
+ while (m->m_info[i].state <= 0)
+ i++;
+
+ return i;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+ struct ceph_mdsmap *m;
+ int i, j, n;
+ int err = -EINVAL;
+ u16 version;
+
+ m = kzalloc(sizeof(*m), GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_16_safe(p, end, version, bad);
+
+ ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+ m->m_epoch = ceph_decode_32(p);
+ m->m_client_epoch = ceph_decode_32(p);
+ m->m_last_failure = ceph_decode_32(p);
+ m->m_root = ceph_decode_32(p);
+ m->m_session_timeout = ceph_decode_32(p);
+ m->m_session_autoclose = ceph_decode_32(p);
+ m->m_max_file_size = ceph_decode_64(p);
+ m->m_max_mds = ceph_decode_32(p);
+
+ m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+ if (m->m_info == NULL)
+ goto badmem;
+
+ /* pick out active nodes from mds_info (state > 0) */
+ n = ceph_decode_32(p);
+ for (i = 0; i < n; i++) {
+ u64 global_id;
+ u32 namelen;
+ s32 mds, inc, state;
+ u64 state_seq;
+ u8 infoversion;
+ struct ceph_entity_addr addr;
+ u32 num_export_targets;
+ void *pexport_targets = NULL;
+
+ ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ global_id = ceph_decode_64(p);
+ infoversion = ceph_decode_8(p);
+ *p += sizeof(u64);
+ namelen = ceph_decode_32(p); /* skip mds name */
+ *p += namelen;
+
+ ceph_decode_need(p, end,
+ 4*sizeof(u32) + sizeof(u64) +
+ sizeof(addr) + sizeof(struct ceph_timespec),
+ bad);
+ mds = ceph_decode_32(p);
+ inc = ceph_decode_32(p);
+ state = ceph_decode_32(p);
+ state_seq = ceph_decode_64(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ *p += sizeof(struct ceph_timespec);
+ *p += sizeof(u32);
+ ceph_decode_32_safe(p, end, namelen, bad);
+ *p += namelen;
+ if (infoversion >= 2) {
+ ceph_decode_32_safe(p, end, num_export_targets, bad);
+ pexport_targets = *p;
+ *p += num_export_targets * sizeof(u32);
+ } else {
+ num_export_targets = 0;
+ }
+
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+ i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+ ceph_mds_state_name(state));
+ if (mds >= 0 && mds < m->m_max_mds && state > 0) {
+ m->m_info[mds].global_id = global_id;
+ m->m_info[mds].state = state;
+ m->m_info[mds].addr = addr;
+ m->m_info[mds].num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ m->m_info[mds].export_targets =
+ kcalloc(num_export_targets, sizeof(u32),
+ GFP_NOFS);
+ for (j = 0; j < num_export_targets; j++)
+ m->m_info[mds].export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ m->m_info[mds].export_targets = NULL;
+ }
+ }
+ }
+
+ /* pg_pools */
+ ceph_decode_32_safe(p, end, n, bad);
+ m->m_num_data_pg_pools = n;
+ m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+ if (!m->m_data_pg_pools)
+ goto badmem;
+ ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+ for (i = 0; i < n; i++)
+ m->m_data_pg_pools[i] = ceph_decode_32(p);
+ m->m_cas_pg_pool = ceph_decode_32(p);
+
+ /* ok, we don't care about the rest. */
+ dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ return m;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ pr_err("corrupt mdsmap\n");
+ ceph_mdsmap_destroy(m);
+ return ERR_PTR(-EINVAL);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+ int i;
+
+ for (i = 0; i < m->m_max_mds; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ kfree(m->m_data_pg_pools);
+ kfree(m);
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u32 m_max_mds; /* size of m_addr, m_state arrays */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u32 *m_data_pg_pools;
+ u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->m_max_mds)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->m_max_mds)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..45cec31fdf5e
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2103 @@
+#include "ceph_debug.h"
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp.h>
+
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+const char *ceph_name_type_str(int t)
+{
+ switch (t) {
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+ default: return "???";
+ }
+}
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+static char addr_str[MAX_ADDR_STR][40];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *pr_addr(const struct sockaddr_storage *ss)
+{
+ int i;
+ char *s;
+ struct sockaddr_in *in4 = (void *)ss;
+ unsigned char *quad = (void *)&in4->sin_addr.s_addr;
+ struct sockaddr_in6 *in6 = (void *)ss;
+
+ spin_lock(&addr_str_lock);
+ i = last_addr_str++;
+ if (last_addr_str == MAX_ADDR_STR)
+ last_addr_str = 0;
+ spin_unlock(&addr_str_lock);
+ s = addr_str[i];
+
+ switch (ss->ss_family) {
+ case AF_INET:
+ sprintf(s, "%u.%u.%u.%u:%u",
+ (unsigned int)quad[0],
+ (unsigned int)quad[1],
+ (unsigned int)quad[2],
+ (unsigned int)quad[3],
+ (unsigned int)ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+ in6->sin6_addr.s6_addr16[0],
+ in6->sin6_addr.s6_addr16[1],
+ in6->sin6_addr.s6_addr16[2],
+ in6->sin6_addr.s6_addr16[3],
+ in6->sin6_addr.s6_addr16[4],
+ in6->sin6_addr.s6_addr16[5],
+ in6->sin6_addr.s6_addr16[6],
+ in6->sin6_addr.s6_addr16[7],
+ (unsigned int)ntohs(in6->sin6_port));
+ break;
+
+ default:
+ sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+ }
+
+ return s;
+}
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+ ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int __init ceph_msgr_init(void)
+{
+ ceph_msgr_wq = create_workqueue("ceph-msgr");
+ if (IS_ERR(ceph_msgr_wq)) {
+ int ret = PTR_ERR(ceph_msgr_wq);
+ pr_err("msgr_init failed to create workqueue: %d\n", ret);
+ ceph_msgr_wq = NULL;
+ return ret;
+ }
+ return 0;
+}
+
+void ceph_msgr_exit(void)
+{
+ destroy_workqueue(ceph_msgr_wq);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("ceph_data_ready on %p state = %lu, queueing work\n",
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write. */
+ if (test_bit(WRITE_PENDING, &con->state)) {
+ dout("ceph_write_space %p queueing write work\n", con);
+ queue_con(con);
+ } else {
+ dout("ceph_write_space %p nothing to write\n", con);
+ }
+
+ /* since we have our own write_space, clear the SOCK_NOSPACE flag */
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ dout("ceph_state_change %p state = %lu sk_state = %u\n",
+ con, con->state, sk->sk_state);
+
+ if (test_bit(CLOSED, &con->state))
+ return;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("ceph_state_change TCP_CLOSE\n");
+ case TCP_CLOSE_WAIT:
+ dout("ceph_state_change TCP_CLOSE_WAIT\n");
+ if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+ if (test_bit(CONNECTING, &con->state))
+ con->error_msg = "connection failed";
+ else
+ con->error_msg = "socket closed";
+ queue_con(con);
+ }
+ break;
+ case TCP_ESTABLISHED:
+ dout("ceph_state_change TCP_ESTABLISHED\n");
+ queue_con(con);
+ break;
+ }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+ struct ceph_connection *con)
+{
+ struct sock *sk = sock->sk;
+ sk->sk_user_data = (void *)con;
+ sk->sk_data_ready = ceph_data_ready;
+ sk->sk_write_space = ceph_write_space;
+ sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+ struct socket *sock;
+ int ret;
+
+ BUG_ON(con->sock);
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ERR_PTR(ret);
+ con->sock = sock;
+ sock->sk->sk_allocation = GFP_NOFS;
+
+ set_sock_callbacks(sock, con);
+
+ dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+
+ ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ pr_addr(&con->peer_addr.in_addr),
+ sock->sk->sk_state);
+ ret = 0;
+ }
+ if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ pr_addr(&con->peer_addr.in_addr), ret);
+ sock_release(sock);
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+ return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, int more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+ int rc;
+
+ dout("con_close_socket on %p sock %p\n", con, con->sock);
+ if (!con->sock)
+ return 0;
+ set_bit(SOCK_CLOSED, &con->state);
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ clear_bit(SOCK_CLOSED, &con->state);
+ return rc;
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+ /* reset connection, out_queue, msg_ and connect_seq */
+ /* discard existing out_queue and msg_seq */
+ mutex_lock(&con->out_mutex);
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+ con->connect_seq = 0;
+ con->out_seq = 0;
+ con->out_msg = NULL;
+ con->in_seq = 0;
+ mutex_unlock(&con->out_mutex);
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
+ reset_connection(con);
+ queue_con(con);
+}
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+ dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+ set_bit(OPENING, &con->state);
+ clear_bit(CLOSED, &con->state);
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ queue_con(con);
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+ dout("con_get %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+ if (atomic_inc_not_zero(&con->nref))
+ return con;
+ return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+ dout("con_put %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+ BUG_ON(atomic_read(&con->nref) == 0);
+ if (atomic_dec_and_test(&con->nref)) {
+ BUG_ON(con->sock);
+ kfree(con);
+ }
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ atomic_set(&con->nref, 1);
+ con->msgr = msgr;
+ mutex_init(&con->out_mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, con_work);
+}
+
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);
+ return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+ con->out_kvec_left++;
+ con->out_more = m->more_to_follow;
+ con->out_msg = NULL; /* we're done with this one */
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ int v = 0;
+
+ con->out_kvec_bytes = 0;
+ con->out_kvec_is_msg = true;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con->out_kvec[v].iov_base = &tag_ack;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[v].iov_base = &con->out_temp_ack;
+ con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ }
+
+ /* move message to sending/sent list */
+ m = list_first_entry(&con->out_queue,
+ struct ceph_msg, list_head);
+ list_move_tail(&m->list_head, &con->out_sent);
+ con->out_msg = m; /* we don't bother taking a reference here. */
+
+ m->hdr.seq = cpu_to_le64(++con->out_seq);
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ le32_to_cpu(m->hdr.data_len),
+ m->nr_pages);
+ BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+ /* tag + hdr + front + middle */
+ con->out_kvec[v].iov_base = &tag_msg;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_kvec[v].iov_base = &m->hdr;
+ con->out_kvec[v++].iov_len = sizeof(m->hdr);
+ con->out_kvec[v++] = m->front;
+ if (m->middle)
+ con->out_kvec[v++] = m->middle->vec;
+ con->out_kvec_left = v;
+ con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+ (m->middle ? m->middle->vec.iov_len : 0);
+ con->out_kvec_cur = con->out_kvec;
+
+ /* fill in crc (except data pages), footer */
+ con->out_msg->hdr.crc =
+ cpu_to_le32(crc32c(0, (void *)&m->hdr,
+ sizeof(m->hdr) - sizeof(m->hdr.crc)));
+ con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+ con->out_msg->footer.front_crc =
+ cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+ if (m->middle)
+ con->out_msg->footer.middle_crc =
+ cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len));
+ else
+ con->out_msg->footer.middle_crc = 0;
+ con->out_msg->footer.data_crc = 0;
+ dout("prepare_write_message front_crc %u data_crc %u\n",
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+
+ /* is there a data payload? */
+ if (le32_to_cpu(m->hdr.data_len) > 0) {
+ /* initialize page iterator */
+ con->out_msg_pos.page = 0;
+ con->out_msg_pos.page_pos =
+ le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+ con->out_msg_pos.data_pos = 0;
+ con->out_msg_pos.did_page_crc = 0;
+ con->out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, v);
+ }
+
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con->out_kvec[0].iov_base = &tag_ack;
+ con->out_kvec[0].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[1].iov_base = &con->out_temp_ack;
+ con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 1; /* more will follow.. eventually.. */
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con->out_kvec[0].iov_base = &tag_keepalive;
+ con->out_kvec[0].iov_len = 1;
+ con->out_kvec_left = 1;
+ con->out_kvec_bytes = 1;
+ con->out_kvec_cur = con->out_kvec;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+ void *auth_buf;
+ int auth_len = 0;
+ int auth_protocol = 0;
+
+ if (con->ops->get_authorizer)
+ con->ops->get_authorizer(con, &auth_buf, &auth_len,
+ &auth_protocol, &con->auth_reply_buf,
+ &con->auth_reply_buf_len,
+ con->auth_retry);
+
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+ con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+ con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+ con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+ con->out_kvec_left++;
+ con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+ struct ceph_connection *con)
+{
+ int len = strlen(CEPH_BANNER);
+
+ con->out_kvec[0].iov_base = CEPH_BANNER;
+ con->out_kvec[0].iov_len = len;
+ con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+ con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+ struct ceph_connection *con,
+ int after_banner)
+{
+ unsigned global_seq = get_global_seq(con->msgr, 0);
+ int proto;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->connect_seq, global_seq, proto);
+
+ con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+ con->out_connect.global_seq = cpu_to_le32(global_seq);
+ con->out_connect.protocol_version = cpu_to_le32(proto);
+ con->out_connect.flags = 0;
+ if (test_bit(LOSSYTX, &con->state))
+ con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY;
+
+ if (!after_banner) {
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ }
+ con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+ con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+ con->out_kvec_left++;
+ con->out_kvec_bytes += sizeof(con->out_connect);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+
+ prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+ while (con->out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+ con->out_kvec_left, con->out_kvec_bytes,
+ con->out_more);
+ if (ret <= 0)
+ goto out;
+ con->out_kvec_bytes -= ret;
+ if (con->out_kvec_bytes == 0)
+ break; /* done */
+ while (ret > 0) {
+ if (ret >= con->out_kvec_cur->iov_len) {
+ ret -= con->out_kvec_cur->iov_len;
+ con->out_kvec_cur++;
+ con->out_kvec_left--;
+ } else {
+ con->out_kvec_cur->iov_len -= ret;
+ con->out_kvec_cur->iov_base += ret;
+ ret = 0;
+ break;
+ }
+ }
+ }
+ con->out_kvec_left = 0;
+ con->out_kvec_is_msg = false;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->out_kvec_bytes, con->out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+ size_t len;
+ int crc = con->msgr->nocrc;
+ int ret;
+
+ dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+ con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+ con->out_msg_pos.page_pos);
+
+ while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+ struct page *page = NULL;
+ void *kaddr = NULL;
+
+ /*
+ * if we are calculating the data crc (the default), we need
+ * to map the page. if our pages[] has been revoked, use the
+ * zero page.
+ */
+ if (msg->pages) {
+ page = msg->pages[con->out_msg_pos.page];
+ if (crc)
+ kaddr = kmap(page);
+ } else {
+ page = con->msgr->zero_page;
+ if (crc)
+ kaddr = page_address(con->msgr->zero_page);
+ }
+ len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
+ (int)(data_len - con->out_msg_pos.data_pos));
+ if (crc && !con->out_msg_pos.did_page_crc) {
+ void *base = kaddr + con->out_msg_pos.page_pos;
+ u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+ BUG_ON(kaddr == NULL);
+ con->out_msg->footer.data_crc =
+ cpu_to_le32(crc32c(tmpcrc, base, len));
+ con->out_msg_pos.did_page_crc = 1;
+ }
+
+ ret = kernel_sendpage(con->sock, page,
+ con->out_msg_pos.page_pos, len,
+ MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_MORE);
+
+ if (crc && msg->pages)
+ kunmap(page);
+
+ if (ret <= 0)
+ goto out;
+
+ con->out_msg_pos.data_pos += ret;
+ con->out_msg_pos.page_pos += ret;
+ if (ret == len) {
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.page++;
+ con->out_msg_pos.did_page_crc = 0;
+ }
+ }
+
+ dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+ /* prepare and queue up footer, too */
+ if (!crc)
+ con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_left = 0;
+ con->out_kvec_cur = con->out_kvec;
+ prepare_write_message_footer(con, 0);
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ while (con->out_skip > 0) {
+ struct kvec iov = {
+ .iov_base = page_address(con->msgr->zero_page),
+ .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+ };
+
+ ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+ if (ret <= 0)
+ goto out;
+ con->out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect_retry(struct ceph_connection *con)
+{
+ dout("prepare_read_connect_retry %p\n", con);
+ con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
+ + sizeof(con->peer_addr_for_me);
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+ int *to, int size, void *object)
+{
+ *to += size;
+ while (con->in_base_pos < *to) {
+ int left = *to - con->in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+ /* peer's banner */
+ ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+ &con->actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+ &con->peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+ ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+ con->auth_reply_buf);
+ if (ret <= 0)
+ goto out;
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, (int)con->in_reply.tag,
+ le32_to_cpu(con->in_reply.connect_seq),
+ le32_to_cpu(con->in_reply.global_seq));
+out:
+ return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ pr_addr(&con->peer_addr.in_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ case AF_INET6:
+ return
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ }
+ return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ }
+ return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ case AF_INET6:
+ ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ }
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count)
+{
+ int i;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ const char *ipend;
+ struct sockaddr_storage *ss = &addr[i].in_addr;
+ struct sockaddr_in *in4 = (void *)ss;
+ struct sockaddr_in6 *in6 = (void *)ss;
+ int port;
+
+ memset(ss, 0, sizeof(*ss));
+ if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET;
+ } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET6;
+ } else {
+ goto bad;
+ }
+ p = ipend;
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port > 65535 || port == 0)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ addr_set_port(ss, port);
+
+ dout("parse_ips got %s\n", pr_addr(ss));
+
+ if (p == end)
+ break;
+ if (*p != ',')
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ pr_err("parse_ips bad ip '%s'\n", c);
+ return -EINVAL;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ ceph_decode_addr(&con->actual_peer_addr);
+ ceph_decode_addr(&con->peer_addr_for_me);
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (!ceph_entity_addr_is_local(&con->peer_addr,
+ &con->actual_peer_addr) &&
+ !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_err("wrong peer, want %s/%d, "
+ "got %s/%d, wtf\n",
+ pr_addr(&con->peer_addr.in_addr),
+ con->peer_addr.nonce,
+ pr_addr(&con->actual_peer_addr.in_addr),
+ con->actual_peer_addr.nonce);
+ con->error_msg = "protocol error, wrong peer";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+ int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+ memcpy(&con->msgr->inst.addr.in_addr,
+ &con->peer_addr_for_me.in_addr,
+ sizeof(con->peer_addr_for_me.in_addr));
+ addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ pr_addr(&con->msgr->inst.addr.in_addr));
+ }
+
+ set_bit(NEGOTIATING, &con->state);
+ prepare_read_connect(con);
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+ switch (con->in_reply.tag) {
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ dout("process_connect got BADPROTOVER my %d != their %d\n",
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ if (con->ops->bad_proto)
+ con->ops->bad_proto(con);
+ reset_connection(con);
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->auth_retry);
+ if (con->auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ reset_connection(con);
+ set_bit(CLOSED, &con->state);
+ return -1;
+ }
+ con->auth_retry = 1;
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect_retry(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->in_connect.connect_seq));
+ pr_err("%s%lld %s connection reset\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr));
+ reset_connection(con);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+ le32_to_cpu(con->out_connect.connect_seq),
+ le32_to_cpu(con->in_connect.connect_seq));
+ con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_connect.global_seq));
+ get_global_seq(con->msgr,
+ le32_to_cpu(con->in_connect.global_seq));
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_READY:
+ clear_bit(CONNECTING, &con->state);
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+ con->connect_seq++;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.connect_seq),
+ con->connect_seq);
+ WARN_ON(con->connect_seq !=
+ le32_to_cpu(con->in_reply.connect_seq));
+ prepare_read_tag(con);
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ pr_err("process_connect peer connecting WAIT\n");
+
+ default:
+ pr_err("connect protocol error, will retry\n");
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int to = 0;
+
+ return read_partial(con, &to, sizeof(con->in_temp_ack),
+ &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u64 ack = le64_to_cpu(con->in_temp_ack);
+ u64 seq;
+
+ mutex_lock(&con->out_mutex);
+ while (!list_empty(&con->out_sent)) {
+ m = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ seq = le64_to_cpu(m->hdr.seq);
+ if (seq > ack)
+ break;
+ dout("got ack for seq %llu type %d at %p\n", seq,
+ le16_to_cpu(m->hdr.type), m);
+ ceph_msg_remove(m);
+ }
+ mutex_unlock(&con->out_mutex);
+ prepare_read_tag(con);
+}
+
+
+
+
+
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ void *p;
+ int ret;
+ int to, want, left;
+ unsigned front_len, middle_len, data_len, data_off;
+ int datacrc = con->msgr->nocrc;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ while (con->in_base_pos < sizeof(con->in_hdr)) {
+ left = sizeof(con->in_hdr) - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock,
+ (char *)&con->in_hdr + con->in_base_pos,
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ if (con->in_base_pos == sizeof(con->in_hdr)) {
+ u32 crc = crc32c(0, (void *)&con->in_hdr,
+ sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+ if (crc != le32_to_cpu(con->in_hdr.crc)) {
+ pr_err("read_partial_message bad hdr "
+ " crc %u != expected %u\n",
+ crc, con->in_hdr.crc);
+ return -EBADMSG;
+ }
+ }
+ }
+
+ front_len = le32_to_cpu(con->in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+ con->in_hdr.front_len, con->in_hdr.data_len);
+ con->in_msg = con->ops->alloc_msg(con, &con->in_hdr);
+ if (!con->in_msg) {
+ /* skip this message */
+ dout("alloc_msg returned NULL, skipping message\n");
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ }
+ if (IS_ERR(con->in_msg)) {
+ ret = PTR_ERR(con->in_msg);
+ con->in_msg = NULL;
+ con->error_msg = "out of memory for incoming message";
+ return ret;
+ }
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr));
+ }
+
+ /* front */
+ while (m->front.iov_len < front_len) {
+ BUG_ON(m->front.iov_base == NULL);
+ left = front_len - m->front.iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base +
+ m->front.iov_len, left);
+ if (ret <= 0)
+ return ret;
+ m->front.iov_len += ret;
+ if (m->front.iov_len == front_len)
+ con->in_front_crc = crc32c(0, m->front.iov_base,
+ m->front.iov_len);
+ }
+
+ /* middle */
+ while (middle_len > 0 && (!m->middle ||
+ m->middle->vec.iov_len < middle_len)) {
+ if (m->middle == NULL) {
+ ret = -EOPNOTSUPP;
+ if (con->ops->alloc_middle)
+ ret = con->ops->alloc_middle(con, m);
+ if (ret < 0) {
+ dout("alloc_middle failed, skipping payload\n");
+ con->in_base_pos = -middle_len - data_len
+ - sizeof(m->footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ }
+ m->middle->vec.iov_len = 0;
+ }
+ left = middle_len - m->middle->vec.iov_len;
+ ret = ceph_tcp_recvmsg(con->sock,
+ (char *)m->middle->vec.iov_base +
+ m->middle->vec.iov_len, left);
+ if (ret <= 0)
+ return ret;
+ m->middle->vec.iov_len += ret;
+ if (m->middle->vec.iov_len == middle_len)
+ con->in_middle_crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ }
+
+ /* (page) data */
+ data_off = le16_to_cpu(m->hdr.data_off);
+ if (data_len == 0)
+ goto no_data;
+
+ if (m->nr_pages == 0) {
+ con->in_msg_pos.page = 0;
+ con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+ con->in_msg_pos.data_pos = 0;
+ /* find pages for data payload */
+ want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+ ret = -1;
+ if (con->ops->prepare_pages)
+ ret = con->ops->prepare_pages(con, m, want);
+ if (ret < 0) {
+ dout("%p prepare_pages failed, skipping payload\n", m);
+ con->in_base_pos = -data_len - sizeof(m->footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ }
+ BUG_ON(m->nr_pages < want);
+ }
+ while (con->in_msg_pos.data_pos < data_len) {
+ left = min((int)(data_len - con->in_msg_pos.data_pos),
+ (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+ BUG_ON(m->pages == NULL);
+ p = kmap(m->pages[con->in_msg_pos.page]);
+ ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+ left);
+ if (ret > 0 && datacrc)
+ con->in_data_crc =
+ crc32c(con->in_data_crc,
+ p + con->in_msg_pos.page_pos, ret);
+ kunmap(m->pages[con->in_msg_pos.page]);
+ if (ret <= 0)
+ return ret;
+ con->in_msg_pos.data_pos += ret;
+ con->in_msg_pos.page_pos += ret;
+ if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+ con->in_msg_pos.page_pos = 0;
+ con->in_msg_pos.page++;
+ }
+ }
+
+no_data:
+ /* footer */
+ to = sizeof(m->hdr) + sizeof(m->footer);
+ while (con->in_base_pos < to) {
+ left = to - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+ (con->in_base_pos - sizeof(m->hdr)),
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+
+ con->in_msg = NULL;
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src.name;
+
+ mutex_lock(&con->out_mutex);
+ con->in_seq++;
+ mutex_unlock(&con->out_mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src.name),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+ prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr = con->msgr;
+ int ret = 1;
+
+ dout("try_write start %p state %lu nref %d\n", con, con->state,
+ atomic_read(&con->nref));
+
+ mutex_lock(&con->out_mutex);
+more:
+ dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+ /* open the socket first? */
+ if (con->sock == NULL) {
+ /*
+ * if we were STANDBY and are reconnecting _this_
+ * connection, bump connect_seq now. Always bump
+ * global_seq.
+ */
+ if (test_and_clear_bit(STANDBY, &con->state))
+ con->connect_seq++;
+
+ prepare_write_banner(msgr, con);
+ prepare_write_connect(msgr, con, 1);
+ prepare_read_banner(con);
+ set_bit(CONNECTING, &con->state);
+ clear_bit(NEGOTIATING, &con->state);
+
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %lu\n",
+ con, con->state);
+ con->sock = ceph_tcp_connect(con);
+ if (IS_ERR(con->sock)) {
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ ret = -1;
+ goto out;
+ }
+ }
+
+more_kvec:
+ /* kvec data queued? */
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_skip err %d\n", ret);
+ goto done;
+ }
+ }
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_kvec err %d\n", ret);
+ goto done;
+ }
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ ret = write_partial_msg_pages(con);
+ if (ret == 1)
+ goto more_kvec; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_msg_pages err %d\n",
+ ret);
+ goto done;
+ }
+ }
+
+ if (!test_bit(CONNECTING, &con->state)) {
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ clear_bit(WRITE_PENDING, &con->state);
+ dout("try_write nothing else to write.\n");
+done:
+ ret = 0;
+out:
+ mutex_unlock(&con->out_mutex);
+ dout("try_write done on %p\n", con);
+ return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr;
+ int ret = -1;
+
+ if (!con->sock)
+ return 0;
+
+ if (test_bit(STANDBY, &con->state))
+ return 0;
+
+ dout("try_read start on %p\n", con);
+ msgr = con->msgr;
+
+more:
+ dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+ con->in_base_pos);
+ if (test_bit(CONNECTING, &con->state)) {
+ if (!test_bit(NEGOTIATING, &con->state)) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto done;
+ if (process_banner(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto done;
+ if (process_connect(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ goto more;
+ }
+
+ if (con->in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ *
+ * FIXME: there must be a better way to do this!
+ */
+ static char buf[1024];
+ int skip = min(1024, -con->in_base_pos);
+ dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+ ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+ if (ret <= 0)
+ goto done;
+ con->in_base_pos += ret;
+ if (con->in_base_pos)
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+ if (ret <= 0)
+ goto done;
+ dout("try_read got tag %d\n", (int)con->in_tag);
+ switch (con->in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ set_bit(CLOSED, &con->state); /* fixme */
+ goto done;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc";
+ ret = -EIO;
+ goto out;
+ case -EIO:
+ con->error_msg = "io error";
+ goto out;
+ default:
+ goto done;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ process_message(con);
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto done;
+ process_ack(con);
+ goto more;
+ }
+
+done:
+ ret = 0;
+out:
+ dout("try_read done on %p\n", con);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection. Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY. It
+ * clears QUEUED and does it's thing. When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work. If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+ if (test_bit(DEAD, &con->state)) {
+ dout("queue_con %p ignoring: DEAD\n",
+ con);
+ return;
+ }
+
+ if (!con->ops->get(con)) {
+ dout("queue_con %p ref count 0\n", con);
+ return;
+ }
+
+ set_bit(QUEUED, &con->state);
+ if (test_bit(BUSY, &con->state)) {
+ dout("queue_con %p - already BUSY\n", con);
+ con->ops->put(con);
+ } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+ dout("queue_con %p - already queued\n", con);
+ con->ops->put(con);
+ } else {
+ dout("queue_con %p\n", con);
+ }
+}
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ int backoff = 0;
+
+more:
+ if (test_and_set_bit(BUSY, &con->state) != 0) {
+ dout("con_work %p BUSY already set\n", con);
+ goto out;
+ }
+ dout("con_work %p start, clearing QUEUED\n", con);
+ clear_bit(QUEUED, &con->state);
+
+ if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+ dout("con_work CLOSED\n");
+ con_close_socket(con);
+ goto done;
+ }
+ if (test_and_clear_bit(OPENING, &con->state)) {
+ /* reopen w/ new peer */
+ dout("con_work OPENING\n");
+ con_close_socket(con);
+ }
+
+ if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+ try_read(con) < 0 ||
+ try_write(con) < 0) {
+ backoff = 1;
+ ceph_fault(con); /* error/fault path */
+ }
+
+done:
+ clear_bit(BUSY, &con->state);
+ dout("con->state=%lu\n", con->state);
+ if (test_bit(QUEUED, &con->state)) {
+ if (!backoff) {
+ dout("con_work %p QUEUED reset, looping\n", con);
+ goto more;
+ }
+ dout("con_work %p QUEUED reset, but just faulted\n", con);
+ clear_bit(QUEUED, &con->state);
+ }
+ dout("con_work %p done\n", con);
+
+out:
+ con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+ pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, pr_addr(&con->peer_addr.in_addr));
+
+ if (test_bit(LOSSYTX, &con->state)) {
+ dout("fault on LOSSYTX channel\n");
+ goto out;
+ }
+
+ clear_bit(BUSY, &con->state); /* to avoid an improbable race */
+
+ con_close_socket(con);
+ con->in_msg = NULL;
+
+ /* If there are no messages in the queue, place the connection
+ * in a STANDBY state (i.e., don't try to reconnect just yet). */
+ mutex_lock(&con->out_mutex);
+ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+ dout("fault setting STANDBY\n");
+ set_bit(STANDBY, &con->state);
+ mutex_unlock(&con->out_mutex);
+ goto out;
+ }
+
+ /* Requeue anything that hasn't been acked, and retry after a
+ * delay. */
+ list_splice_init(&con->out_sent, &con->out_queue);
+ mutex_unlock(&con->out_mutex);
+
+ if (con->delay == 0)
+ con->delay = BASE_DELAY_INTERVAL;
+ else if (con->delay < MAX_DELAY_INTERVAL)
+ con->delay *= 2;
+
+ /* explicitly schedule work to try to reconnect again later. */
+ dout("fault queueing %p delay %lu\n", con, con->delay);
+ con->ops->get(con);
+ if (queue_delayed_work(ceph_msgr_wq, &con->work,
+ round_jiffies_relative(con->delay)) == 0)
+ con->ops->put(con);
+
+out:
+ if (con->ops->fault)
+ con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+{
+ struct ceph_messenger *msgr;
+
+ msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+ if (msgr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&msgr->global_seq_lock);
+
+ /* the zero page is needed if a request is "canceled" while the message
+ * is being written over the socket */
+ msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!msgr->zero_page) {
+ kfree(msgr);
+ return ERR_PTR(-ENOMEM);
+ }
+ kmap(msgr->zero_page);
+
+ if (myaddr)
+ msgr->inst.addr = *myaddr;
+
+ /* select a random nonce */
+ get_random_bytes(&msgr->inst.addr.nonce,
+ sizeof(msgr->inst.addr.nonce));
+ encode_my_addr(msgr);
+
+ dout("messenger_create %p\n", msgr);
+ return msgr;
+}
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+ dout("destroy %p\n", msgr);
+ kunmap(msgr->zero_page);
+ __free_page(msgr->zero_page);
+ kfree(msgr);
+ dout("destroyed messenger %p\n", msgr);
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ if (test_bit(CLOSED, &con->state)) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ return;
+ }
+
+ /* set src+dst */
+ msg->hdr.src.name = con->msgr->inst.name;
+ msg->hdr.src.addr = con->msgr->my_enc_addr;
+ msg->hdr.orig_src = msg->hdr.src;
+ msg->hdr.dst_erank = con->peer_addr.erank;
+
+ /* queue */
+ mutex_lock(&con->out_mutex);
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+ mutex_unlock(&con->out_mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->out_mutex);
+ if (!list_empty(&msg->list_head)) {
+ dout("con_revoke %p msg %p\n", con, msg);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
+ if (con->out_msg == msg)
+ con->out_msg = NULL;
+ if (con->out_kvec_is_msg) {
+ con->out_skip = con->out_kvec_bytes;
+ con->out_kvec_is_msg = false;
+ }
+ } else {
+ dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+ }
+ mutex_unlock(&con->out_mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+ test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off, struct page **pages)
+{
+ struct ceph_msg *m;
+
+ m = kmalloc(sizeof(*m), GFP_NOFS);
+ if (m == NULL)
+ goto out;
+ atomic_set(&m->nref, 1);
+ INIT_LIST_HEAD(&m->list_head);
+
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.front_len = cpu_to_le32(front_len);
+ m->hdr.middle_len = 0;
+ m->hdr.data_len = cpu_to_le32(page_len);
+ m->hdr.data_off = cpu_to_le16(page_off);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->footer.front_crc = 0;
+ m->footer.middle_crc = 0;
+ m->footer.data_crc = 0;
+ m->front_max = front_len;
+ m->front_is_vmalloc = false;
+ m->more_to_follow = false;
+ m->pool = NULL;
+
+ /* front */
+ if (front_len) {
+ if (front_len > PAGE_CACHE_SIZE) {
+ m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+ PAGE_KERNEL);
+ m->front_is_vmalloc = true;
+ } else {
+ m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+ }
+ if (m->front.iov_base == NULL) {
+ pr_err("msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front.iov_len = front_len;
+
+ /* middle */
+ m->middle = NULL;
+
+ /* data */
+ m->nr_pages = calc_pages_for(page_off, page_len);
+ m->pages = pages;
+
+ dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
+ m->nr_pages);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ pr_err("msg_new can't create type %d len %d\n", type, front_len);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr)
+{
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return ERR_PTR(-ENOMEM);
+ }
+ return msg;
+}
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new_alloc(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;
+ return 0;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+ dout("msg_kfree %p\n", m);
+ if (m->front_is_vmalloc)
+ vfree(m->front.iov_base);
+ else
+ kfree(m->front.iov_base);
+ kfree(m);
+}
+
+/*
+ * Drop a msg ref. Destroy as needed.
+ */
+void ceph_msg_put(struct ceph_msg *m)
+{
+ dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref),
+ atomic_read(&m->nref)-1);
+ if (atomic_read(&m->nref) <= 0) {
+ pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n",
+ m, le64_to_cpu(m->hdr.seq),
+ le16_to_cpu(m->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(m->hdr.type)),
+ le32_to_cpu(m->hdr.front_len),
+ le32_to_cpu(m->hdr.data_len));
+ WARN_ON(1);
+ }
+ if (atomic_dec_and_test(&m->nref)) {
+ dout("ceph_msg_put last one on %p\n", m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+ m->nr_pages = 0;
+ m->pages = NULL;
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_kfree(m);
+ }
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..f9c9f6487302
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,253 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+ struct ceph_connection *(*get)(struct ceph_connection *);
+ void (*put)(struct ceph_connection *);
+
+ /* handle an incoming message. */
+ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+ /* authorize an outgoing connection */
+ int (*get_authorizer) (struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new);
+ int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+
+ /* protocol version mismatch */
+ void (*bad_proto) (struct ceph_connection *con);
+
+ /* there was some error on the socket (disconnect, whatever) */
+ void (*fault) (struct ceph_connection *con);
+
+ /* a remote host as terminated a message exchange session, and messages
+ * we sent (or they tried to send us) may be lost. */
+ void (*peer_reset) (struct ceph_connection *con);
+
+ struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+ struct ceph_msg_header *hdr);
+ int (*alloc_middle) (struct ceph_connection *con,
+ struct ceph_msg *msg);
+ /* an incoming message has a data payload; tell me what pages I
+ * should read the data into. */
+ int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m,
+ int want);
+};
+
+extern const char *ceph_name_type_str(int t);
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+ struct ceph_entity_inst inst; /* my name+address */
+ struct ceph_entity_addr my_enc_addr;
+ struct page *zero_page; /* used in certain error cases */
+
+ bool nocrc;
+
+ /*
+ * the global_seq counts connections i (attempt to) initiate
+ * in order to disambiguate certain connect race conditions.
+ */
+ u32 global_seq;
+ spinlock_t global_seq_lock;
+};
+
+/*
+ * a single message. it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+ struct ceph_msg_header hdr; /* header */
+ struct ceph_msg_footer footer; /* footer */
+ struct kvec front; /* unaligned blobs of message */
+ struct ceph_buffer *middle;
+ struct page **pages; /* data payload. NOT OWNER. */
+ unsigned nr_pages; /* size of page array */
+ struct list_head list_head;
+ atomic_t nref;
+ bool front_is_vmalloc;
+ bool more_to_follow;
+ int front_max;
+
+ struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+ int page, page_pos; /* which page; offset in page */
+ int data_pos; /* offset in data payload */
+ int did_page_crc; /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL (HZ/2)
+#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX 0 /* we can close channel or drop messages on errors */
+#define CONNECTING 1
+#define NEGOTIATING 2
+#define KEEPALIVE_PENDING 3
+#define WRITE_PENDING 4 /* we have data ready to send */
+#define QUEUED 5 /* there is work queued on this connection */
+#define BUSY 6 /* work is being done */
+#define STANDBY 8 /* no outgoing messages, socket closed. we keep
+ * the ceph_connection around to maintain shared
+ * state with the peer. */
+#define CLOSED 10 /* we've closed the connection */
+#define SOCK_CLOSED 11 /* socket state changed to closed */
+#define REGISTERED 12 /* connection appears in con_tree */
+#define OPENING 13 /* open connection w/ (possibly new) peer */
+#define DEAD 14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+ void *private;
+ atomic_t nref;
+
+ const struct ceph_connection_operations *ops;
+
+ struct ceph_messenger *msgr;
+ struct socket *sock;
+ unsigned long state; /* connection state (see flags above) */
+ const char *error_msg; /* error message, if any */
+
+ struct ceph_entity_addr peer_addr; /* peer address */
+ struct ceph_entity_name peer_name; /* peer name */
+ struct ceph_entity_addr peer_addr_for_me;
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this connection, client */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+
+ int auth_retry; /* true if we need a newer authorizer */
+ void *auth_reply_buf; /* where to put the authorizer reply */
+ int auth_reply_buf_len;
+
+ /* out queue */
+ struct mutex out_mutex;
+ struct list_head out_queue;
+ struct list_head out_sent; /* sending or sent but unacked */
+ u64 out_seq; /* last message queued for send */
+ u64 out_seq_sent; /* last message sent */
+ bool out_keepalive_pending;
+
+ u64 in_seq, in_seq_acked; /* last message received, acked */
+
+ /* connection negotiation temps */
+ char in_banner[CEPH_BANNER_MAX_LEN];
+ union {
+ struct { /* outgoing connection */
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+ };
+ struct { /* incoming */
+ struct ceph_msg_connect in_connect;
+ struct ceph_msg_connect_reply out_reply;
+ };
+ };
+ struct ceph_entity_addr actual_peer_addr;
+
+ /* message out temps */
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+ struct ceph_msg_pos out_msg_pos;
+
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_kvec_is_msg; /* kvec refers to out_msg */
+ int out_more; /* there is more data after the kvecs */
+ __le64 out_temp_ack; /* for writing an ack */
+
+ /* message in temps */
+ struct ceph_msg_header in_hdr;
+ struct ceph_msg *in_msg;
+ struct ceph_msg_pos in_msg_pos;
+ u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
+
+ char in_tag; /* protocol control byte */
+ int in_base_pos; /* bytes read */
+ __le64 in_temp_ack; /* for reading an ack */
+
+ struct delayed_work work; /* send|recv work */
+ unsigned long delay; /* current delay interval */
+};
+
+
+extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+ struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+ struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+ struct ceph_entity_addr *addr);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off,
+ struct page **pages);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+extern struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr);
+extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ dout("ceph_msg_get %p %d -> %d\n", msg, atomic_read(&msg->nref),
+ atomic_read(&msg->nref)+1);
+ atomic_inc(&msg->nref);
+ return msg;
+}
+extern void ceph_msg_put(struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..9ff2da69d33a
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,751 @@
+#include "ceph_debug.h"
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include "mon_client.h"
+#include "super.h"
+#include "auth.h"
+#include "decode.h"
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+const static struct ceph_connection_operations mon_con_ops;
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+ struct ceph_monmap *m = NULL;
+ int i, err = -EINVAL;
+ struct ceph_fsid fsid;
+ u32 epoch, num_mon;
+ u16 version;
+ u32 len;
+
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+
+ dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+ ceph_decode_16_safe(&p, end, version, bad);
+
+ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(&p);
+
+ num_mon = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+ if (num_mon >= CEPH_MAX_MON)
+ goto bad;
+ m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+ m->fsid = fsid;
+ m->epoch = epoch;
+ m->num_mon = num_mon;
+ ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+ for (i = 0; i < num_mon; i++)
+ ceph_decode_addr(&m->mon_inst[i].addr);
+
+ dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+ m->num_mon);
+ for (i = 0; i < m->num_mon; i++)
+ dout("monmap_decode mon%d is %s\n", i,
+ pr_addr(&m->mon_inst[i].addr.in_addr));
+ return m;
+
+bad:
+ dout("monmap_decode failed with %d\n", err);
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++)
+ if (ceph_entity_addr_equal(addr, &m->mon_inst[i].addr))
+ return 1;
+ return 0;
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ if (monc->con) {
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_con_close(monc->con);
+ monc->cur_mon = -1;
+ ceph_auth_reset(monc->auth);
+ }
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+ char r;
+ int ret;
+
+ if (monc->cur_mon < 0) {
+ get_random_bytes(&r, 1);
+ monc->cur_mon = r % monc->monmap->num_mon;
+ dout("open_session num=%d r=%d -> mon%d\n",
+ monc->monmap->num_mon, r, monc->cur_mon);
+ monc->sub_sent = 0;
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+ dout("open_session mon%d opening\n", monc->cur_mon);
+ monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+ monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+ ceph_con_open(monc->con,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /* initiatiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ monc->m_auth->front.iov_len = ret;
+ monc->m_auth->hdr.front_len = cpu_to_le32(ret);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(monc->con, monc->m_auth);
+ } else {
+ dout("open_session mon%d already open\n", monc->cur_mon);
+ }
+ return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+ return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned delay;
+
+ if (monc->cur_mon < 0 || __sub_expired(monc))
+ delay = 10 * HZ;
+ else
+ delay = 20 * HZ;
+ dout("__schedule_delayed after %u\n", delay);
+ schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+ (unsigned)monc->sub_sent, __sub_expired(monc),
+ monc->want_next_osdmap);
+ if ((__sub_expired(monc) && !monc->sub_sent) ||
+ monc->want_next_osdmap == 1) {
+ struct ceph_msg *msg;
+ struct ceph_mon_subscribe_item *i;
+ void *p, *end;
+
+ msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
+ if (!msg)
+ return;
+
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ dout("__send_subscribe to 'mdsmap' %u+\n",
+ (unsigned)monc->have_mdsmap);
+ if (monc->want_next_osdmap) {
+ dout("__send_subscribe to 'osdmap' %u\n",
+ (unsigned)monc->have_osdmap);
+ ceph_encode_32(&p, 3);
+ ceph_encode_string(&p, end, "osdmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_osdmap);
+ i->onetime = 1;
+ p += sizeof(*i);
+ monc->want_next_osdmap = 2; /* requested */
+ } else {
+ ceph_encode_32(&p, 2);
+ }
+ ceph_encode_string(&p, end, "mdsmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_mdsmap);
+ i->onetime = 0;
+ p += sizeof(*i);
+ ceph_encode_string(&p, end, "monmap", 6);
+ i = p;
+ i->have = 0;
+ i->onetime = 0;
+ p += sizeof(*i);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_send(monc->con, msg);
+
+ monc->sub_sent = jiffies | 1; /* never 0 */
+ }
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ unsigned seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ pr_info("mon%d %s session established\n",
+ monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+ monc->hunting = false;
+ }
+ dout("handle_subscribe_ack after %d seconds\n", seconds);
+ monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+ monc->sub_sent = 0;
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_mdsmap = got;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_osdmap = got;
+ monc->want_next_osdmap = 0;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+ dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ mutex_lock(&monc->mutex);
+ if (!monc->want_next_osdmap)
+ monc->want_next_osdmap = 1;
+ if (monc->want_next_osdmap < 2)
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ if (!monc->con) {
+ monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+ if (!monc->con)
+ return -ENOMEM;
+ ceph_con_init(monc->client->msgr, monc->con);
+ monc->con->private = monc;
+ monc->con->ops = &mon_con_ops;
+ }
+
+ mutex_lock(&monc->mutex);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * The monitor responds with mount ack indicate mount success. The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ void *p, *end;
+
+ mutex_lock(&monc->mutex);
+
+ dout("handle_monmap\n");
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(p, end);
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));
+ return;
+ }
+
+ if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ kfree(monmap);
+ return;
+ }
+
+ client->monc.monmap = monmap;
+ kfree(old);
+
+ mutex_unlock(&monc->mutex);
+ wake_up(&client->mount_wq);
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_statfs_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid;
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;
+ tid = le64_to_cpu(reply->tid);
+ dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = radix_tree_lookup(&monc->statfs_request_tree, tid);
+ if (req) {
+ *req->buf = reply->st;
+ req->result = 0;
+ }
+ mutex_unlock(&monc->mutex);
+ if (req)
+ complete(&req->completion);
+ return;
+
+bad:
+ pr_err("corrupt statfs reply, no tid\n");
+}
+
+/*
+ * (re)send a statfs request
+ */
+static int send_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *req)
+{
+ struct ceph_msg *msg;
+ struct ceph_mon_statfs *h;
+
+ dout("send_statfs tid %llu\n", req->tid);
+ msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ req->request = msg;
+ h = msg->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ h->tid = cpu_to_le64(req->tid);
+ ceph_con_send(monc->con, msg);
+ return 0;
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+ struct ceph_mon_statfs_request req;
+ int err;
+
+ req.buf = buf;
+ init_completion(&req.completion);
+
+ /* allocate memory for reply */
+ err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
+ if (err)
+ return err;
+
+ /* register request */
+ mutex_lock(&monc->mutex);
+ req.tid = ++monc->last_tid;
+ req.last_attempt = jiffies;
+ req.delay = BASE_DELAY_INTERVAL;
+ if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) {
+ mutex_unlock(&monc->mutex);
+ pr_err("ENOMEM in do_statfs\n");
+ return -ENOMEM;
+ }
+ monc->num_statfs_requests++;
+ mutex_unlock(&monc->mutex);
+
+ /* send request and wait */
+ err = send_statfs(monc, &req);
+ if (!err)
+ err = wait_for_completion_interruptible(&req.completion);
+
+ mutex_lock(&monc->mutex);
+ radix_tree_delete(&monc->statfs_request_tree, req.tid);
+ monc->num_statfs_requests--;
+ ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+ mutex_unlock(&monc->mutex);
+
+ if (!err)
+ err = req.result;
+ return err;
+}
+
+/*
+ * Resend pending statfs requests.
+ */
+static void __resend_statfs(struct ceph_mon_client *monc)
+{
+ u64 next_tid = 0;
+ int got;
+ int did = 0;
+ struct ceph_mon_statfs_request *req;
+
+ while (1) {
+ got = radix_tree_gang_lookup(&monc->statfs_request_tree,
+ (void **)&req,
+ next_tid, 1);
+ if (got == 0)
+ break;
+ did++;
+ next_tid = req->tid + 1;
+
+ send_statfs(monc, req);
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ dout("monc delayed_work\n");
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ __close_session(monc);
+ __open_session(monc); /* continue hunting */
+ } else {
+ ceph_con_keepalive(monc->con);
+ if (monc->auth->ops->is_authenticated(monc->auth))
+ __send_subscribe(monc);
+ }
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ struct ceph_mount_args *args = monc->client->mount_args;
+ struct ceph_entity_addr *mon_addr = args->mon_addr;
+ int num_mon = args->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(sizeof(*monc->monmap) +
+ num_mon*sizeof(monc->monmap->mon_inst[0]),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ for (i = 0; i < num_mon; i++) {
+ monc->monmap->mon_inst[i].addr = mon_addr[i];
+ monc->monmap->mon_inst[i].addr.erank = 0;
+ monc->monmap->mon_inst[i].addr.nonce = 0;
+ monc->monmap->mon_inst[i].name.type =
+ CEPH_ENTITY_TYPE_MON;
+ monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ }
+ monc->monmap->num_mon = num_mon;
+ monc->have_fsid = false;
+
+ /* release addr memory */
+ kfree(args->mon_addr);
+ args->mon_addr = NULL;
+ args->num_mon = 0;
+ return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+ int err = 0;
+
+ dout("init\n");
+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ monc->monmap = NULL;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ monc->con = NULL;
+
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->mount_args->name,
+ cl->mount_args->secret);
+ if (IS_ERR(monc->auth))
+ return PTR_ERR(monc->auth);
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msg pools */
+ err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
+ sizeof(struct ceph_mon_subscribe_ack), 1, false);
+ if (err < 0)
+ goto out_monmap;
+ err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
+ sizeof(struct ceph_mon_statfs_reply), 0, false);
+ if (err < 0)
+ goto out_pool1;
+ err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
+ if (err < 0)
+ goto out_pool2;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+ if (IS_ERR(monc->m_auth)) {
+ err = PTR_ERR(monc->m_auth);
+ monc->m_auth = NULL;
+ goto out_pool3;
+ }
+
+ monc->cur_mon = -1;
+ monc->hunting = true;
+ monc->sub_renew_after = jiffies;
+ monc->sub_sent = 0;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_NOFS);
+ monc->num_statfs_requests = 0;
+ monc->last_tid = 0;
+
+ monc->have_mdsmap = 0;
+ monc->have_osdmap = 0;
+ monc->want_next_osdmap = 1;
+ return 0;
+
+out_pool3:
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+out_pool2:
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+out_pool1:
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_monmap:
+ kfree(monc->monmap);
+out:
+ return err;
+}
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+ if (monc->con) {
+ monc->con->private = NULL;
+ monc->con->ops->put(monc->con);
+ monc->con = NULL;
+ }
+ mutex_unlock(&monc->mutex);
+
+ ceph_auth_destroy(monc->auth);
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+
+ kfree(monc->monmap);
+}
+
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ if (ret < 0) {
+ monc->client->mount_err = ret;
+ wake_up(&monc->client->mount_wq);
+ } else if (ret > 0) {
+ monc->m_auth->front.iov_len = ret;
+ monc->m_auth->hdr.front_len = cpu_to_le32(ret);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(monc->con, monc->m_auth);
+ } else if (monc->auth->ops->is_authenticated(monc->auth)) {
+ dout("authenticated, starting session\n");
+
+ monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr->inst.name.num = monc->auth->global_id;
+
+ __send_subscribe(monc);
+ __resend_statfs(monc);
+ }
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!monc)
+ return;
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(&monc->client->mdsc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front);
+ case CEPH_MSG_STATFS_REPLY:
+ return ceph_msgpool_get(&monc->msgpool_statfs_reply, front);
+ case CEPH_MSG_AUTH_REPLY:
+ return ceph_msgpool_get(&monc->msgpool_auth_reply, front);
+ }
+ return ceph_alloc_msg(con, hdr);
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ if (!monc)
+ return;
+
+ dout("mon_fault\n");
+ mutex_lock(&monc->mutex);
+ if (!con->private)
+ goto out;
+
+ if (monc->con && !monc->hunting)
+ pr_info("mon%d %s session lost, "
+ "hunting for new mon\n", monc->cur_mon,
+ pr_addr(&monc->con->peer_addr.in_addr));
+
+ __close_session(monc);
+ if (!monc->hunting) {
+ /* start hunting */
+ monc->hunting = true;
+ __open_session(monc);
+ } else {
+ /* already hunting, let's wait a bit */
+ __schedule_delayed(monc);
+ }
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+const static struct ceph_connection_operations mon_con_ops = {
+ .get = ceph_con_get,
+ .put = ceph_con_put,
+ .dispatch = dispatch,
+ .fault = mon_fault,
+ .alloc_msg = mon_alloc_msg,
+ .alloc_middle = ceph_alloc_middle,
+};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..c75b53302ecc
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,115 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+
+#include "messenger.h"
+#include "msgpool.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 num_mon;
+ struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_statfs_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+ int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+ struct ceph_mon_client *monc;
+ struct delayed_work delayed_work;
+ unsigned long delay;
+ ceph_monc_request_func_t do_request;
+};
+
+/*
+ * statfs() is done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_statfs_request {
+ u64 tid;
+ int result;
+ struct ceph_statfs *buf;
+ struct completion completion;
+ unsigned long last_attempt, delay; /* jiffies */
+ struct ceph_msg *request; /* original request */
+};
+
+struct ceph_mon_client {
+ struct ceph_client *client;
+ struct ceph_monmap *monmap;
+
+ struct mutex mutex;
+ struct delayed_work delayed_work;
+
+ struct ceph_auth_client *auth;
+ struct ceph_msg *m_auth;
+
+ bool hunting;
+ int cur_mon; /* last monitor i contacted */
+ unsigned long sub_sent, sub_renew_after;
+ struct ceph_connection *con;
+ bool have_fsid;
+
+ /* msg pools */
+ struct ceph_msgpool msgpool_subscribe_ack;
+ struct ceph_msgpool msgpool_statfs_reply;
+ struct ceph_msgpool msgpool_auth_reply;
+
+ /* pending statfs requests */
+ struct radix_tree_root statfs_request_tree;
+ int num_statfs_requests;
+ u64 last_tid;
+
+ /* mds/osd map */
+ int want_next_osdmap; /* 1 = want, 2 = want+asked */
+ u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+ struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map. We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+ struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+
+
+#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..7599b3382076
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,181 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+/*
+ * We use msg pools to preallocate memory for messages we expect to
+ * receive over the wire, to avoid getting ourselves into OOM
+ * conditions at unexpected times. We take use a few different
+ * strategies:
+ *
+ * - for request/response type interactions, we preallocate the
+ * memory needed for the response when we generate the request.
+ *
+ * - for messages we can receive at any time from the MDS, we preallocate
+ * a pool of messages we can re-use.
+ *
+ * - for writeback, we preallocate some number of messages to use for
+ * requests and their replies, so that we always make forward
+ * progress.
+ *
+ * The msgpool behaves like a mempool_t, but keeps preallocated
+ * ceph_msgs strung together on a list_head instead of using a pointer
+ * vector. This avoids vector reallocation when we adjust the number
+ * of preallocated items (which happens frequently).
+ */
+
+
+/*
+ * Allocate or release as necessary to meet our target pool size.
+ */
+static int __fill_msgpool(struct ceph_msgpool *pool)
+{
+ struct ceph_msg *msg;
+
+ while (pool->num < pool->min) {
+ dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
+ pool->min);
+ spin_unlock(&pool->lock);
+ msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+ spin_lock(&pool->lock);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ msg->pool = pool;
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ }
+ while (pool->num > pool->min) {
+ msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
+ dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
+ pool->min, msg);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ ceph_msg_kfree(msg);
+ }
+ return 0;
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int min, bool blocking)
+{
+ int ret;
+
+ dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
+ spin_lock_init(&pool->lock);
+ pool->front_len = front_len;
+ INIT_LIST_HEAD(&pool->msgs);
+ pool->num = 0;
+ pool->min = min;
+ pool->blocking = blocking;
+ init_waitqueue_head(&pool->wait);
+
+ spin_lock(&pool->lock);
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool_destroy %p\n", pool);
+ spin_lock(&pool->lock);
+ pool->min = 0;
+ __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+}
+
+int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+{
+ int ret;
+
+ spin_lock(&pool->lock);
+ dout("msgpool_resv %p delta %d\n", pool, delta);
+ pool->min += delta;
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
+{
+ wait_queue_t wait;
+ struct ceph_msg *msg;
+
+ if (front_len && front_len > pool->front_len) {
+ pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
+ pool, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ if (!front_len)
+ front_len = pool->front_len;
+
+ if (pool->blocking) {
+ /* mempool_t behavior; first try to alloc */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ while (1) {
+ spin_lock(&pool->lock);
+ if (likely(pool->num)) {
+ msg = list_entry(pool->msgs.next, struct ceph_msg,
+ list_head);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ return msg;
+ }
+ pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
+ pool->min, pool->blocking ? "waiting" : "failing");
+ spin_unlock(&pool->lock);
+
+ if (!pool->blocking) {
+ WARN_ON(1);
+
+ /* maybe we can allocate it now? */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+
+ return ERR_PTR(-ENOMEM);
+ }
+
+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&pool->wait, &wait);
+ }
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ spin_lock(&pool->lock);
+ if (pool->num < pool->min) {
+ ceph_msg_get(msg); /* retake a single ref */
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ wake_up(&pool->wait);
+ } else {
+ dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ ceph_msg_kfree(msg);
+ }
+}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+ spinlock_t lock;
+ int front_len; /* preallocated payload size */
+ struct list_head msgs; /* msgs in the pool; each has 1 ref */
+ int num, min; /* cur, min # msgs in the pool */
+ bool blocking;
+ wait_queue_head_t wait;
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int size, bool blocking);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+ int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..c758e8f8f71b
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,167 @@
+#ifndef __MSGR_H
+#define __MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+#define CEPH_PORT_START 6800 /* non-monitors start here */
+#define CEPH_PORT_LAST 6900
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v024"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_ADMIN 0x10
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 erank; /* entity's rank in process */
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
+ const struct ceph_entity_addr *b)
+{
+ return a->nonce == b->nonce &&
+ memcmp(&a->in_addr, &b->in_addr, sizeof(a->in_addr)) == 0;
+}
+
+static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
+ const struct ceph_entity_addr *b)
+{
+ return memcmp(a, b, sizeof(*a)) == 0;
+}
+
+struct ceph_entity_inst {
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len;
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 dst_erank;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+
+
+#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..5d30d5959b97
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1363 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "super.h"
+#include "osd_client.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+
+const static struct ceph_connection_operations osd_con_ops;
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_osd_op *op = (void *)(reqhead + 1);
+ u64 orig_len = *plen;
+ u64 objoff, objlen; /* extent in object */
+ u64 bno;
+
+ reqhead->snapid = cpu_to_le64(vino.snap);
+
+ /* object extent? */
+ ceph_calc_file_object_mapping(layout, off, plen, &bno,
+ &objoff, &objlen);
+ if (*plen < orig_len)
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+
+ sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+ req->r_oid_len = strlen(req->r_oid);
+
+ op->extent.offset = cpu_to_le64(objoff);
+ op->extent.length = cpu_to_le64(objlen);
+ req->r_num_pages = calc_pages_for(off, *plen);
+
+ dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
+ req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
+}
+
+
+/*
+ * requests
+ */
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
+ atomic_read(&req->r_ref)-1);
+ BUG_ON(atomic_read(&req->r_ref) <= 0);
+ if (atomic_dec_and_test(&req->r_ref)) {
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+ if (req->r_own_pages)
+ ceph_release_page_vector(req->r_pages,
+ req->r_num_pages);
+ ceph_put_snap_context(req->r_snapc);
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else
+ kfree(req);
+ }
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync,
+ u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply)
+{
+ struct ceph_osd_request *req;
+ struct ceph_msg *msg;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ void *p;
+ int do_trunc = truncate_seq && (off + *plen > truncate_size);
+ int num_op = 1 + do_sync + do_trunc;
+ size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+ int err, i;
+ u64 prevofs;
+
+ if (use_mempool) {
+ req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+ memset(req, 0, sizeof(*req));
+ } else {
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ }
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+ if (err) {
+ ceph_osdc_put_request(req);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+ atomic_set(&req->r_ref, 1);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_flags = flags;
+
+ WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+ /* create message; allow space for oid */
+ msg_size += 40;
+ if (snapc)
+ msg_size += sizeof(u64) * snapc->num_snaps;
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+ ceph_osdc_put_request(req);
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+ head = msg->front.iov_base;
+ op = (void *)(head + 1);
+ p = (void *)(op + num_op);
+
+ req->r_request = msg;
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ head->client_inc = cpu_to_le32(1); /* always, for now. */
+ head->flags = cpu_to_le32(flags);
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ ceph_encode_timespec(&head->mtime, mtime);
+ head->num_ops = cpu_to_le16(num_op);
+ op->op = cpu_to_le16(opcode);
+
+ /* calculate max write size */
+ calc_layout(osdc, vino, layout, off, plen, req);
+ req->r_file_layout = *layout; /* keep a copy */
+
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ req->r_request->hdr.data_off = cpu_to_le16(off);
+ req->r_request->hdr.data_len = cpu_to_le32(*plen);
+ op->payload_len = cpu_to_le32(*plen);
+ }
+
+ /* fill in oid */
+ head->object_len = cpu_to_le32(req->r_oid_len);
+ memcpy(p, req->r_oid, req->r_oid_len);
+ p += req->r_oid_len;
+
+ /* additional ops */
+ if (do_trunc) {
+ op++;
+ op->op = cpu_to_le16(opcode == CEPH_OSD_OP_READ ?
+ CEPH_OSD_OP_MASKTRUNC : CEPH_OSD_OP_SETTRUNC);
+ op->trunc.truncate_seq = cpu_to_le32(truncate_seq);
+ prevofs = le64_to_cpu((op-1)->extent.offset);
+ op->trunc.truncate_size = cpu_to_le64(truncate_size -
+ (off-prevofs));
+ }
+ if (do_sync) {
+ op++;
+ op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
+ }
+ if (snapc) {
+ head->snap_seq = cpu_to_le64(snapc->seq);
+ head->num_snaps = cpu_to_le32(snapc->num_snaps);
+ for (i = 0; i < snapc->num_snaps; i++) {
+ put_unaligned_le64(snapc->snaps[i], p);
+ p += sizeof(u64);
+ }
+ }
+
+ BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+ return req;
+}
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *new)
+{
+ struct rb_node **p = &osdc->requests.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_osd_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid) {
+ if (!n->rb_left)
+ return req;
+ n = n->rb_left;
+ } else if (tid > req->r_tid) {
+ n = n->rb_right;
+ } else {
+ return req;
+ }
+ }
+ return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+
+ if (!osd)
+ return;
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ osd->o_incarnation++;
+ down_read(&osdc->map_sem);
+ kick_requests(osdc, osd);
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd *osd;
+
+ osd = kzalloc(sizeof(*osd), GFP_NOFS);
+ if (!osd)
+ return NULL;
+
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ INIT_LIST_HEAD(&osd->o_requests);
+ osd->o_incarnation = 1;
+
+ ceph_con_init(osdc->client->msgr, &osd->o_con);
+ osd->o_con.private = osd;
+ osd->o_con.ops = &osd_con_ops;
+ osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (atomic_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+ atomic_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);
+ return NULL;
+ }
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+ atomic_read(&osd->o_ref) - 1);
+ if (atomic_dec_and_test(&osd->o_ref))
+ kfree(osd);
+}
+
+/*
+ * remove an osd from our map
+ */
+static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("remove_osd %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_requests));
+ rb_erase(&osd->o_node, &osdc->osds);
+ ceph_con_close(&osd->o_con);
+ put_osd(osd);
+}
+
+/*
+ * reset osd connect
+ */
+static int reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ int ret = 0;
+
+ dout("reset_osd %p osd%d\n", osd, osd->o_osd);
+ if (list_empty(&osd->o_requests)) {
+ remove_osd(osdc, osd);
+ } else {
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+ osd->o_incarnation++;
+ }
+ return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+ struct rb_node **p = &osdc->osds.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd *osd = NULL;
+
+ while (*p) {
+ parent = *p;
+ osd = rb_entry(parent, struct ceph_osd, o_node);
+ if (new->o_osd < osd->o_osd)
+ p = &(*p)->rb_left;
+ else if (new->o_osd > osd->o_osd)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->o_node, parent, p);
+ rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+ struct ceph_osd *osd;
+ struct rb_node *n = osdc->osds.rb_node;
+
+ while (n) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ if (o < osd->o_osd)
+ n = n->rb_left;
+ else if (o > osd->o_osd)
+ n = n->rb_right;
+ else
+ return osd;
+ }
+ return NULL;
+}
+
+
+/*
+ * Register request, assign tid. If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *head = req->r_request->front.iov_base;
+
+ mutex_lock(&osdc->request_mutex);
+ req->r_tid = ++osdc->last_tid;
+ head->tid = cpu_to_le64(req->r_tid);
+
+ dout("register_request %p tid %lld\n", req, req->r_tid);
+ __insert_request(osdc, req);
+ ceph_osdc_get_request(req);
+ osdc->num_requests++;
+
+ req->r_timeout_stamp =
+ jiffies + osdc->client->mount_args->osd_timeout*HZ;
+
+ if (osdc->num_requests == 1) {
+ osdc->timeout_tid = req->r_tid;
+ dout(" timeout on tid %llu at %lu\n", req->r_tid,
+ req->r_timeout_stamp);
+ schedule_delayed_work(&osdc->timeout_work,
+ round_jiffies_relative(req->r_timeout_stamp - jiffies));
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &osdc->requests);
+ osdc->num_requests--;
+
+ if (req->r_osd) {
+ /* make sure the original request isn't in flight. */
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+ list_del_init(&req->r_osd_item);
+ if (list_empty(&req->r_osd->o_requests))
+ remove_osd(osdc, req->r_osd);
+ req->r_osd = NULL;
+ }
+
+ ceph_osdc_put_request(req);
+
+ if (req->r_tid == osdc->timeout_tid) {
+ if (osdc->num_requests == 0) {
+ dout("no requests, canceling timeout\n");
+ osdc->timeout_tid = 0;
+ cancel_delayed_work(&osdc->timeout_work);
+ } else {
+ req = rb_entry(rb_first(&osdc->requests),
+ struct ceph_osd_request, r_node);
+ osdc->timeout_tid = req->r_tid;
+ dout("rescheduled timeout on tid %llu at %lu\n",
+ req->r_tid, req->r_timeout_stamp);
+ schedule_delayed_work(&osdc->timeout_work,
+ round_jiffies_relative(req->r_timeout_stamp -
+ jiffies));
+ }
+ }
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+ if (req->r_sent) {
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ req->r_sent = 0;
+ }
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately. If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_pg pgid;
+ int o = -1;
+ int err;
+ struct ceph_osd *newosd = NULL;
+
+ dout("map_osds %p tid %lld\n", req, req->r_tid);
+ err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+ &req->r_file_layout, osdc->osdmap);
+ if (err)
+ return err;
+ pgid = reqhead->layout.ol_pgid;
+ o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+
+ if ((req->r_osd && req->r_osd->o_osd == o &&
+ req->r_sent >= req->r_osd->o_incarnation) ||
+ (req->r_osd == NULL && o == -1))
+ return 0; /* no change */
+
+ dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+ req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ if (req->r_osd) {
+ __cancel_request(req);
+ list_del_init(&req->r_osd_item);
+ if (list_empty(&req->r_osd->o_requests)) {
+ /* try to re-use r_osd if possible */
+ newosd = get_osd(req->r_osd);
+ remove_osd(osdc, newosd);
+ }
+ req->r_osd = NULL;
+ }
+
+ req->r_osd = __lookup_osd(osdc, o);
+ if (!req->r_osd && o >= 0) {
+ if (newosd) {
+ req->r_osd = newosd;
+ newosd = NULL;
+ } else {
+ err = -ENOMEM;
+ req->r_osd = create_osd(osdc);
+ if (!req->r_osd)
+ goto out;
+ }
+
+ dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+ req->r_osd->o_osd = o;
+ req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+ __insert_osd(osdc, req->r_osd);
+
+ ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+ }
+
+ if (req->r_osd)
+ list_add(&req->r_osd_item, &req->r_osd->o_requests);
+ err = 1; /* osd changed */
+
+out:
+ if (newosd)
+ put_osd(newosd);
+ return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead;
+ int err;
+
+ err = __map_osds(osdc, req);
+ if (err < 0)
+ return err;
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ return 0;
+ }
+
+ dout("send_request %p tid %llu to osd%d flags %d\n",
+ req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+ reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
+ reqhead->reassert_version = req->r_reassert_version;
+
+ req->r_timeout_stamp = jiffies+osdc->client->mount_args->osd_timeout*HZ;
+
+ ceph_msg_get(req->r_request); /* send consumes a ref */
+ ceph_con_send(&req->r_osd->o_con, req->r_request);
+ req->r_sent = req->r_osd->o_incarnation;
+ return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds. When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected. Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_osd_request *req;
+ struct ceph_osd *osd;
+ unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+ unsigned long next_timeout = timeout + jiffies;
+ struct rb_node *p;
+
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ int err;
+
+ dout("osdc resending prev failed %lld\n", req->r_tid);
+ err = __send_request(osdc, req);
+ if (err)
+ dout("osdc failed again on %lld\n", req->r_tid);
+ else
+ req->r_resend = false;
+ continue;
+ }
+ }
+ for (p = rb_first(&osdc->osds); p; p = rb_next(p)) {
+ osd = rb_entry(p, struct ceph_osd, o_node);
+ if (list_empty(&osd->o_requests))
+ continue;
+ req = list_first_entry(&osd->o_requests,
+ struct ceph_osd_request, r_osd_item);
+ if (time_before(jiffies, req->r_timeout_stamp))
+ continue;
+
+ dout(" tid %llu (at least) timed out on osd%d\n",
+ req->r_tid, osd->o_osd);
+ req->r_timeout_stamp = next_timeout;
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ if (osdc->timeout_tid)
+ schedule_delayed_work(&osdc->timeout_work,
+ round_jiffies_relative(timeout));
+
+ mutex_unlock(&osdc->request_mutex);
+
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+ struct ceph_osd_request *req;
+ u64 tid;
+ int numops, object_len, flags;
+
+ if (msg->front.iov_len < sizeof(*rhead))
+ goto bad;
+ tid = le64_to_cpu(rhead->tid);
+ numops = le32_to_cpu(rhead->num_ops);
+ object_len = le32_to_cpu(rhead->object_len);
+ if (msg->front.iov_len != sizeof(*rhead) + object_len +
+ numops * sizeof(struct ceph_osd_op))
+ goto bad;
+ dout("handle_reply %p tid %llu\n", msg, tid);
+
+ /* lookup */
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (req == NULL) {
+ dout("handle_reply tid %llu dne\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ return;
+ }
+ ceph_osdc_get_request(req);
+ flags = le32_to_cpu(rhead->flags);
+
+ if (req->r_reply) {
+ /*
+ * once we see the message has been received, we don't
+ * need a ref (which is only needed for revoking
+ * pages)
+ */
+ ceph_msg_put(req->r_reply);
+ req->r_reply = NULL;
+ }
+
+ if (!req->r_got_reply) {
+ unsigned bytes;
+
+ req->r_result = le32_to_cpu(rhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
+ dout("handle_reply result %d bytes %d\n", req->r_result,
+ bytes);
+ if (req->r_result == 0)
+ req->r_result = bytes;
+
+ /* in case this is a write and we need to replay, */
+ req->r_reassert_version = rhead->reassert_version;
+
+ req->r_got_reply = 1;
+ } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+ dout("handle_reply tid %llu dup ack\n", tid);
+ goto done;
+ }
+
+ dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+ /* either this is a read, or we got the safe response */
+ if ((flags & CEPH_OSD_FLAG_ONDISK) ||
+ ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+ __unregister_request(osdc, req);
+
+ mutex_unlock(&osdc->request_mutex);
+
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete(&req->r_completion);
+
+ if (flags & CEPH_OSD_FLAG_ONDISK) {
+ if (req->r_safe_callback)
+ req->r_safe_callback(req, msg);
+ complete(&req->r_safe_completion); /* fsync waiter */
+ }
+
+done:
+ ceph_osdc_put_request(req);
+ return;
+
+bad:
+ pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+ (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+ (int)sizeof(*rhead));
+}
+
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed. Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *p, *n;
+ int needmap = 0;
+ int err;
+
+ dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+ mutex_lock(&osdc->request_mutex);
+ if (!kickosd) {
+ for (p = rb_first(&osdc->osds); p; p = n) {
+ struct ceph_osd *osd =
+ rb_entry(p, struct ceph_osd, o_node);
+
+ n = rb_next(p);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ !ceph_entity_addr_equal(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap,
+ osd->o_osd)))
+ reset_osd(osdc, osd);
+ }
+ }
+
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ dout(" r_resend set on tid %llu\n", req->r_tid);
+ __cancel_request(req);
+ goto kick;
+ }
+ if (req->r_osd && kickosd == req->r_osd) {
+ __cancel_request(req);
+ goto kick;
+ }
+
+ err = __map_osds(osdc, req);
+ if (err == 0)
+ continue; /* no change */
+ if (err < 0) {
+ /*
+ * FIXME: really, we should set the request
+ * error and fail if this isn't a 'nofail'
+ * request, but that's a fair bit more
+ * complicated to do. So retry!
+ */
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ continue;
+ }
+ if (req->r_osd == NULL) {
+ dout("tid %llu maps to no valid osd\n", req->r_tid);
+ needmap++; /* request a newer map */
+ continue;
+ }
+
+kick:
+ dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+ req->r_osd->o_osd);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ err = __send_request(osdc, req);
+ if (err) {
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ }
+ }
+ mutex_unlock(&osdc->request_mutex);
+
+ if (needmap) {
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
+}
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p, *end, *next;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_osdmap *newmap = NULL, *oldmap;
+ int err;
+ struct ceph_fsid fsid;
+
+ dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ return;
+
+ down_write(&osdc->map_sem);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ next = p + maplen;
+ if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ newmap = osdmap_apply_incremental(&p, next,
+ osdc->osdmap,
+ osdc->client->msgr);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ if (newmap != osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p = next;
+ nr_maps--;
+ }
+ if (newmap)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ dout("taking full map %u len %d\n", epoch, maplen);
+ newmap = osdmap_decode(&p, p+maplen);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ oldmap = osdc->osdmap;
+ osdc->osdmap = newmap;
+ if (oldmap)
+ ceph_osdmap_destroy(oldmap);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+done:
+ downgrade_write(&osdc->map_sem);
+ ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+ if (newmap)
+ kick_requests(osdc, NULL);
+ up_read(&osdc->map_sem);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ up_write(&osdc->map_sem);
+ return;
+}
+
+
+/*
+ * A read request prepares specific pages that data is to be read into.
+ * When a message is being read off the wire, we call prepare_pages to
+ * find those pages.
+ * 0 = success, -1 failure.
+ */
+static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
+ int want)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_reply_head *rhead = m->front.iov_base;
+ struct ceph_osd_request *req;
+ u64 tid;
+ int ret = -1;
+ int type = le16_to_cpu(m->hdr.type);
+
+ if (!osd)
+ return -1;
+ osdc = osd->o_osdc;
+
+ dout("prepare_pages on msg %p want %d\n", m, want);
+ if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
+ return -1; /* hmm! */
+
+ tid = le64_to_cpu(rhead->tid);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (!req) {
+ dout("prepare_pages unknown tid %llu\n", tid);
+ goto out;
+ }
+ dout("prepare_pages tid %llu has %d pages, want %d\n",
+ tid, req->r_num_pages, want);
+ if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) {
+ m->pages = req->r_pages;
+ m->nr_pages = req->r_num_pages;
+ req->r_reply = m; /* only for duration of read over socket */
+ ceph_msg_get(m);
+ req->r_prepared_pages = 1;
+ ret = 0; /* success */
+ }
+out:
+ mutex_unlock(&osdc->request_mutex);
+ return ret;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc = 0;
+
+ req->r_request->pages = req->r_pages;
+ req->r_request->nr_pages = req->r_num_pages;
+
+ register_request(osdc, req);
+
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ /*
+ * a racing kick_requests() may have sent the message for us
+ * while we dropped request_mutex above, so only send now if
+ * the request still han't been touched yet.
+ */
+ if (req->r_sent == 0) {
+ rc = __send_request(osdc, req);
+ if (rc) {
+ if (nofail) {
+ dout("osdc_start_request failed send, "
+ " marking %lld\n", req->r_tid);
+ req->r_resend = true;
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ }
+ }
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ return rc;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ int rc;
+
+ rc = wait_for_completion_interruptible(&req->r_completion);
+ if (rc < 0) {
+ mutex_lock(&osdc->request_mutex);
+ __cancel_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("wait_request tid %llu timed out\n", req->r_tid);
+ return rc;
+ }
+
+ dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+ return req->r_result;
+}
+
+/*
+ * sync - wait for all in-flight requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req;
+ u64 last_tid, next_tid = 0;
+
+ mutex_lock(&osdc->request_mutex);
+ last_tid = osdc->last_tid;
+ while (1) {
+ req = __lookup_request_ge(osdc, next_tid);
+ if (!req)
+ break;
+ if (req->r_tid > last_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync waiting on tid %llu (last is %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&osdc->request_mutex);
+ ceph_osdc_put_request(req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync done (thru tid %llu)\n", last_tid);
+}
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ osdc->osdmap = NULL;
+ init_rwsem(&osdc->map_sem);
+ init_completion(&osdc->map_waiters);
+ osdc->last_requested_map = 0;
+ mutex_init(&osdc->request_mutex);
+ osdc->timeout_tid = 0;
+ osdc->last_tid = 0;
+ osdc->osds = RB_ROOT;
+ osdc->requests = RB_ROOT;
+ osdc->num_requests = 0;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+
+ err = -ENOMEM;
+ osdc->req_mempool = mempool_create_kmalloc_pool(10,
+ sizeof(struct ceph_osd_request));
+ if (!osdc->req_mempool)
+ goto out;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
+ if (err < 0)
+ goto out_msgpool;
+ return 0;
+
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);
+out:
+ return err;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ if (osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = NULL;
+ }
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0, truncate_seq, truncate_size, NULL,
+ false, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+ num_pages = calc_pages_for(off, *plen);
+ req->r_num_pages = num_pages;
+
+ dout("readpages final extent is %llu~%llu (%d pages)\n",
+ off, *plen, req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int num_pages,
+ int flags, int do_sync, bool nofail)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ BUG_ON(vino.snap != CEPH_NOSNAP);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+ CEPH_OSD_OP_WRITE,
+ flags | CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE,
+ snapc, do_sync,
+ truncate_seq, truncate_size, mtime,
+ nofail, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
+ req->r_num_pages = calc_pages_for(off, len);
+ dout("writepages %llu~%llu (%d pages)\n", off, len,
+ req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, nofail);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!osd)
+ return;
+ osdc = osd->o_osdc;
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osdc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ switch (type) {
+ case CEPH_MSG_OSD_OPREPLY:
+ return ceph_msgpool_get(&osdc->msgpool_op_reply, front);
+ }
+ return ceph_alloc_msg(con, hdr);
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;
+ return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ int ret = 0;
+
+ if (force_new && o->o_authorizer) {
+ ac->ops->destroy_authorizer(ac, o->o_authorizer);
+ o->o_authorizer = NULL;
+ }
+ if (o->o_authorizer == NULL) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_OSD,
+ &o->o_authorizer,
+ &o->o_authorizer_buf,
+ &o->o_authorizer_buf_len,
+ &o->o_authorizer_reply_buf,
+ &o->o_authorizer_reply_buf_len);
+ if (ret)
+ return ret;
+ }
+
+ *proto = ac->protocol;
+ *buf = o->o_authorizer_buf;
+ *len = o->o_authorizer_buf_len;
+ *reply_buf = o->o_authorizer_reply_buf;
+ *reply_len = o->o_authorizer_reply_buf_len;
+ return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+
+const static struct ceph_connection_operations osd_con_ops = {
+ .get = get_osd_con,
+ .put = put_osd_con,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .alloc_msg = alloc_msg,
+ .fault = osd_reset,
+ .alloc_middle = ceph_alloc_middle,
+ .prepare_pages = prepare_pages,
+};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..3d4ae6595aaa
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,150 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+ struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+ atomic_t o_ref;
+ struct ceph_osd_client *o_osdc;
+ int o_osd;
+ int o_incarnation;
+ struct rb_node o_node;
+ struct ceph_connection o_con;
+ struct list_head o_requests;
+ struct ceph_authorizer *o_authorizer;
+ void *o_authorizer_buf, *o_authorizer_reply_buf;
+ size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+ u64 r_tid; /* unique for this client */
+ struct rb_node r_node;
+ struct list_head r_osd_item;
+ struct ceph_osd *r_osd;
+
+ struct ceph_msg *r_request, *r_reply;
+ int r_result;
+ int r_flags; /* any additional flags for the osd */
+ u32 r_sent; /* >0 if r_request is sending/sent */
+ int r_prepared_pages, r_got_reply;
+
+ struct ceph_osd_client *r_osdc;
+ atomic_t r_ref;
+ bool r_mempool;
+ struct completion r_completion, r_safe_completion;
+ ceph_osdc_callback_t r_callback, r_safe_callback;
+ struct ceph_eversion r_reassert_version;
+ struct list_head r_unsafe_item;
+
+ struct inode *r_inode; /* for use by callbacks */
+ struct writeback_control *r_wbc; /* ditto */
+
+ char r_oid[40]; /* object name */
+ int r_oid_len;
+ unsigned long r_timeout_stamp;
+ bool r_resend; /* msg send failed, needs retry */
+
+ struct ceph_file_layout r_file_layout;
+ struct ceph_snap_context *r_snapc; /* snap context for writes */
+ unsigned r_num_pages; /* size of page array (follows) */
+ struct page **r_pages; /* pages for data payload */
+ int r_pages_from_pool;
+ int r_own_pages; /* if true, i own page list */
+};
+
+struct ceph_osd_client {
+ struct ceph_client *client;
+
+ struct ceph_osdmap *osdmap; /* current map */
+ struct rw_semaphore map_sem;
+ struct completion map_waiters;
+ u64 last_requested_map;
+
+ struct mutex request_mutex;
+ struct rb_root osds; /* osds */
+ u64 timeout_tid; /* tid of timeout triggering rq */
+ u64 last_tid; /* tid of last request */
+ struct rb_root requests; /* pending requests */
+ int num_requests;
+ struct delayed_work timeout_work;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ mempool_t *req_mempool;
+
+ struct ceph_msgpool msgpool_op;
+ struct ceph_msgpool msgpool_op_reply;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+ struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 offset, u64 *len, int op, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync, u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ atomic_inc(&req->r_ref);
+}
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *sc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int nr_pages,
+ int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..8c994c714781
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,916 @@
+
+#include <asm/div64.h>
+
+#include "super.h"
+#include "osdmap.h"
+#include "crush/hash.h"
+#include "crush/mapper.h"
+#include "decode.h"
+#include "ceph_debug.h"
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+ int flag = 0;
+
+ if (!len)
+ goto done;
+
+ *str = '\0';
+ if (state) {
+ if (state & CEPH_OSD_EXISTS) {
+ snprintf(str, len, "exists");
+ flag = 1;
+ }
+ if (state & CEPH_OSD_UP) {
+ snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+ "up");
+ flag = 1;
+ }
+ } else {
+ snprintf(str, len, "doesn't exist");
+ }
+done:
+ return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+ pi->pgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+ pi->lpg_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+ pi->lpgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_32_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err = -EINVAL;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned)magic, (unsigned)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+ if (c->device_parents == NULL)
+ goto badmem;
+ c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+ if (c->bucket_parents == NULL)
+ goto badmem;
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ default:
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+ b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+ if (b->perm == NULL)
+ goto badmem;
+ b->perm_n = 0;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ }
+ }
+
+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = c->rules[i] = kmalloc(sizeof(*r) +
+ yes*sizeof(struct crush_rule_step),
+ GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ /* ignore trailing name maps. */
+
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+}
+
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp))
+ rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->pg_pool);
+ kfree(map->osd_addr);
+ kfree(map);
+}
+
+/*
+ * adjust max osd value. reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ struct ceph_entity_addr *addr;
+ u32 *weight;
+
+ state = kcalloc(max, sizeof(*state), GFP_NOFS);
+ addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+ weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+ if (state == NULL || addr == NULL || weight == NULL) {
+ kfree(state);
+ kfree(addr);
+ kfree(weight);
+ return -ENOMEM;
+ }
+
+ /* copy old? */
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+ memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+ memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+ kfree(map->osd_state);
+ kfree(map->osd_addr);
+ kfree(map->osd_weight);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ map->max_osd = max;
+ return 0;
+}
+
+/*
+ * Insert a new pg_temp mapping
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ u64 a = *(u64 *)&l;
+ u64 b = *(u64 *)&r;
+
+ if (a < b)
+ return -1;
+ if (a > b)
+ return 1;
+ return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ u16 version;
+ u32 len, max, i;
+ int err = -EINVAL;
+ void *start = *p;
+
+ dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (map == NULL)
+ return ERR_PTR(-ENOMEM);
+ map->pg_temp = RB_ROOT;
+
+ ceph_decode_16_safe(p, end, version, bad);
+
+ ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ map->num_pools = ceph_decode_32(p);
+ map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
+ GFP_NOFS);
+ if (!map->pg_pool) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
+ i = ceph_decode_32(p);
+ if (i >= map->num_pools)
+ goto bad;
+ ceph_decode_copy(p, &map->pg_pool[i].v,
+ sizeof(map->pg_pool->v));
+ calc_pg_masks(&map->pg_pool[i]);
+ p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
+ p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
+ * sizeof(u64) * 2;
+ }
+
+ ceph_decode_32_safe(p, end, map->flags, bad);
+
+ max = ceph_decode_32(p);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+ /* osds */
+ err = -EINVAL;
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(1 + sizeof(*map->osd_weight) +
+ sizeof(*map->osd_addr)), bad);
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+ *p += 4; /* skip length field (should match max) */
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+ for (i = 0; i < map->max_osd; i++)
+ ceph_decode_addr(&map->osd_addr[i]);
+
+ /* pg_temp */
+ ceph_decode_32_safe(p, end, len, bad);
+ for (i = 0; i < len; i++) {
+ int n, j;
+ struct ceph_pg pgid;
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ n = ceph_decode_32(p);
+ ceph_decode_need(p, end, n * sizeof(u32), bad);
+ pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+ if (!pg) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pg->pgid = pgid;
+ pg->len = n;
+ for (j = 0; j < n; j++)
+ pg->osds[j] = ceph_decode_32(p);
+
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, bad);
+ dout("osdmap_decode crush len %d from off 0x%x\n", len,
+ (int)(*p - start));
+ ceph_decode_need(p, end, len, bad);
+ map->crush = crush_decode(*p, end);
+ *p += len;
+ if (IS_ERR(map->crush)) {
+ err = PTR_ERR(map->crush);
+ map->crush = NULL;
+ goto bad;
+ }
+
+ /* ignore the rest of the map */
+ *p = end;
+
+ dout("osdmap_decode done %p %p\n", *p, end);
+ return map;
+
+bad:
+ dout("osdmap_decode fail\n");
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr)
+{
+ struct ceph_osdmap *newmap = map;
+ struct crush_map *newcrush = NULL;
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ u32 len, pool;
+ __s32 new_flags, max;
+ void *start = *p;
+ int err = -EINVAL;
+ u16 version;
+ struct rb_node *rbp;
+
+ ceph_decode_16_safe(p, end, version, bad);
+
+ ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+ bad);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ newmap = osdmap_decode(p, min(*p+len, end));
+ return newmap; /* error or not */
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental new crush map len %d, %p to %p\n",
+ len, *p, end);
+ newcrush = crush_decode(*p, min(*p+len, end));
+ if (IS_ERR(newcrush))
+ return ERR_PTR(PTR_ERR(newcrush));
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+
+ ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+ /* new max? */
+ max = ceph_decode_32(p);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ }
+
+ map->epoch++;
+ map->modified = map->modified;
+ if (newcrush) {
+ if (map->crush)
+ crush_destroy(map->crush);
+ map->crush = newcrush;
+ newcrush = NULL;
+ }
+
+ /* new_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ ceph_decode_32_safe(p, end, pool, bad);
+ if (pool >= map->num_pools) {
+ void *pg_pool = kcalloc(pool + 1,
+ sizeof(*map->pg_pool),
+ GFP_NOFS);
+ if (!pg_pool) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ memcpy(pg_pool, map->pg_pool,
+ map->num_pools * sizeof(*map->pg_pool));
+ kfree(map->pg_pool);
+ map->pg_pool = pg_pool;
+ map->num_pools = pool+1;
+ }
+ ceph_decode_copy(p, &map->pg_pool[pool].v,
+ sizeof(map->pg_pool->v));
+ calc_pg_masks(&map->pg_pool[pool]);
+ }
+
+ /* old_pool (ignore) */
+ ceph_decode_32_safe(p, end, len, bad);
+ *p += len * sizeof(u32);
+
+ /* new_up */
+ err = -EINVAL;
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ struct ceph_entity_addr addr;
+ ceph_decode_32_safe(p, end, osd, bad);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+ ceph_decode_addr(&addr);
+ pr_info("osd%d up\n", osd);
+ BUG_ON(osd >= map->max_osd);
+ map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ /* new_down */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ ceph_decode_32_safe(p, end, osd, bad);
+ (*p)++; /* clean flag */
+ pr_info("osd%d down\n", osd);
+ if (osd < map->max_osd)
+ map->osd_state[osd] &= ~CEPH_OSD_UP;
+ }
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd, off;
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ osd = ceph_decode_32(p);
+ off = ceph_decode_32(p);
+ pr_info("osd%d weight 0x%x %s\n", osd, off,
+ off == CEPH_OSD_IN ? "(in)" :
+ (off == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd < map->max_osd)
+ map->osd_weight[osd] = off;
+ }
+
+ /* new_pg_temp */
+ rbp = rb_first(&map->pg_temp);
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_mapping *pg;
+ int j;
+ struct ceph_pg pgid;
+ u32 pglen;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ pglen = ceph_decode_32(p);
+
+ /* remove any? */
+ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+ node)->pgid, pgid) <= 0) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ if (pglen) {
+ /* insert */
+ ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+ if (!pg) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pg->pgid = pgid;
+ pg->len = pglen;
+ for (j = 0; j < len; j++)
+ pg->osds[j] = ceph_decode_32(p);
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+ pglen);
+ }
+ }
+ while (rbp) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ /* ignore the rest */
+ *p = end;
+ return map;
+
+bad:
+ pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+ epoch, (int)(*p - start), *p, start, end);
+ if (newcrush)
+ crush_destroy(newcrush);
+ return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *ono,
+ u64 *oxoff, u64 *oxlen)
+{
+ u32 osize = le32_to_cpu(layout->fl_object_size);
+ u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 bl, stripeno, stripepos, objsetno;
+ u32 su_per_object;
+ u64 t, su_offset;
+
+ dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
+ osize, su);
+ su_per_object = osize / su;
+ dout("osize %u / su %u = su_per_object %u\n", osize, su,
+ su_per_object);
+
+ BUG_ON((su & ~PAGE_MASK) != 0);
+ /* bl = *off / su; */
+ t = off;
+ do_div(t, su);
+ bl = t;
+ dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+ stripeno = bl / sc;
+ stripepos = bl % sc;
+ objsetno = stripeno / su_per_object;
+
+ *ono = objsetno * sc + stripepos;
+ dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+ /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
+ t = off;
+ su_offset = do_div(t, su);
+ *oxoff = su_offset + (stripeno % su_per_object) * su;
+
+ /*
+ * Calculate the length of the extent being written to the selected
+ * object. This is the minimum of the full length requested (plen) or
+ * the remainder of the current stripe being written to.
+ */
+ *oxlen = min_t(u64, *plen, su - su_offset);
+ *plen = *oxlen;
+
+ dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap)
+{
+ unsigned num, num_mask;
+ struct ceph_pg pgid;
+ s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+ int poolid = le32_to_cpu(fl->fl_pg_pool);
+ struct ceph_pg_pool_info *pool;
+ unsigned ps;
+
+ if (poolid >= osdmap->num_pools)
+ return -EIO;
+
+ pool = &osdmap->pg_pool[poolid];
+ ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+ if (preferred >= 0) {
+ ps += preferred;
+ num = le32_to_cpu(pool->v.lpg_num);
+ num_mask = pool->lpg_num_mask;
+ } else {
+ num = le32_to_cpu(pool->v.pg_num);
+ num_mask = pool->pg_num_mask;
+ }
+
+ pgid.ps = cpu_to_le16(ps);
+ pgid.preferred = cpu_to_le16(preferred);
+ pgid.pool = fl->fl_pg_pool;
+ if (preferred >= 0)
+ dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+ (int)preferred);
+ else
+ dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+ ol->ol_pgid = pgid;
+ ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+ return 0;
+}
+
+/*
+ * Calculate raw osd vector for the given pgid. Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *num)
+{
+ struct rb_node *n = osdmap->pg_temp.rb_node;
+ struct ceph_pg_mapping *pg;
+ struct ceph_pg_pool_info *pool;
+ int ruleno;
+ unsigned poolid, ps, pps;
+ int preferred;
+ int c;
+
+ /* pg_temp? */
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else {
+ *num = pg->len;
+ return pg->osds;
+ }
+ }
+
+ /* crush */
+ poolid = le32_to_cpu(pgid.pool);
+ ps = le16_to_cpu(pgid.ps);
+ preferred = (s16)le16_to_cpu(pgid.preferred);
+
+ if (poolid >= osdmap->num_pools)
+ return NULL;
+ pool = &osdmap->pg_pool[poolid];
+ ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+ pool->v.type, pool->v.size);
+ if (ruleno < 0) {
+ pr_err("no crush rule pool %d type %d size %d\n",
+ poolid, pool->v.type, pool->v.size);
+ return NULL;
+ }
+
+ if (preferred >= 0)
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.lpgp_num),
+ pool->lpgp_num_mask);
+ else
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.pgp_num),
+ pool->pgp_num_mask);
+ pps += poolid;
+ *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+ min_t(int, pool->v.size, *num),
+ preferred, osdmap->osd_weight);
+ return osds;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+ int rawosds[10], *osds;
+ int i, num = ARRAY_SIZE(rawosds);
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i])) {
+ return osds[i];
+ break;
+ }
+ return -1;
+}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..c4af8418aa00
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,124 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include "crush/crush.h"
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds. That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+ struct ceph_pg_pool v;
+ int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+};
+
+struct ceph_pg_mapping {
+ struct rb_node node;
+ struct ceph_pg pgid;
+ int len;
+ int osds[];
+};
+
+struct ceph_osdmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 mkfs_epoch;
+ struct ceph_timespec created, modified;
+
+ u32 flags; /* CEPH_OSDMAP_* */
+
+ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
+ u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
+ struct ceph_entity_addr *osd_addr;
+
+ struct rb_root pg_temp;
+
+ u32 num_pools;
+ struct ceph_pg_pool_info *pg_pool;
+
+ /* the CRUSH map specifies the mapping of placement groups to
+ * the list of osds that store+replicate them. */
+ struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+ ((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+ ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_stripe_unit) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_object_size) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+ return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+ return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+ int osd)
+{
+ if (osd >= map->max_osd)
+ return NULL;
+ return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid);
+
+#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..12bfb2f7c275
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,370 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP 1
+#define CEPH_PG_TYPE_RAID4 2
+struct ceph_pg_pool {
+ __u8 type; /* CEPH_PG_TYPE_* */
+ __u8 size; /* number of osds in each pg */
+ __u8 crush_ruleset; /* crush placement rule */
+ __u8 object_hash; /* hash mapping object name to ps */
+ __le32 pg_num, pgp_num; /* number of pg's */
+ __le32 lpg_num, lpgp_num; /* number of localized pg's */
+ __le32 last_change; /* most recent epoch changed */
+ __le64 snap_seq; /* seq for per-pool snapshot */
+ __le32 snap_epoch; /* epoch of last snap */
+ __le32 num_snaps;
+ __le32 num_removed_snap_intervals;
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask;
+ else
+ return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP 2
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+
+enum {
+ /** data **/
+ /* read */
+ CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+ /* fancy read */
+ CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+ /* write */
+ CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+ CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+ CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+ CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+ /* fancy write */
+ CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+ CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+ CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+ CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+ CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+ CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+ CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+ CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+ /** attrs **/
+ /* read */
+ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+ /* write */
+ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+ CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+ CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+ /** subop **/
+ CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
+ CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
+ CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
+ CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+ CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
+
+ /** lock **/
+ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+ CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+ CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+ CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+ CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+ CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+ /** exec **/
+ CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+ /** pg **/
+ CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM 'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 16, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 32, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 256,
+ CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) trunc;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 cookie, count;
+ } __attribute__ ((packed)) pgls;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header. each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+ __le64 tid; /* transaction id */
+ __le32 client_inc; /* client incarnation */
+ struct ceph_object_layout layout; /* pgid */
+ __le32 osdmap_epoch; /* client's osdmap epoch */
+
+ __le32 flags;
+
+ struct ceph_timespec mtime; /* for mutations only */
+ struct ceph_eversion reassert_version; /* if we are replaying op */
+
+ __le32 object_len; /* length of object name */
+
+ __le64 snapid; /* snapid to read */
+ __le64 snap_seq; /* writer's snap context */
+ __le32 num_snaps;
+
+ __le16 num_ops;
+ struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+ __le64 tid; /* transaction id */
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..52f46a1208f5
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,887 @@
+#include "ceph_debug.h"
+
+#include <linux/radix-tree.h>
+#include <linux/sort.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client. In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot. Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide. Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory. This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots. An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename). Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system. (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm. This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here. If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("get_realm %p %d -> %d\n", realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ /*
+ * since we _only_ increment realm refs or empty the empty
+ * list with snap_rwsem held, adjusting the empty list here is
+ * safe. we do need to protect against concurrent empty list
+ * additions, however.
+ */
+ if (atomic_read(&realm->nref) == 0) {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_del_init(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+
+ atomic_inc(&realm->nref);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+ struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *realm;
+
+ realm = kzalloc(sizeof(*realm), GFP_NOFS);
+ if (!realm)
+ return ERR_PTR(-ENOMEM);
+
+ radix_tree_insert(&mdsc->snap_realms, ino, realm);
+
+ atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ realm->ino = ino;
+ INIT_LIST_HEAD(&realm->children);
+ INIT_LIST_HEAD(&realm->child_item);
+ INIT_LIST_HEAD(&realm->empty_item);
+ INIT_LIST_HEAD(&realm->inodes_with_caps);
+ spin_lock_init(&realm->inodes_with_caps_lock);
+ dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ return realm;
+}
+
+/*
+ * find and get (if found) the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *realm;
+
+ realm = radix_tree_lookup(&mdsc->snap_realms, ino);
+ if (realm)
+ dout("lookup_snap_realm %llx %p\n", realm->ino, realm);
+ return realm;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+ radix_tree_delete(&mdsc->snap_realms, realm->ino);
+
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ __put_snap_realm(mdsc, realm->parent);
+ }
+
+ kfree(realm->prior_parent_snaps);
+ kfree(realm->snaps);
+ ceph_put_snap_context(realm->cached_context);
+ kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (atomic_dec_and_test(&realm->nref))
+ __destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (!atomic_dec_and_test(&realm->nref))
+ return;
+
+ if (down_write_trylock(&mdsc->snap_rwsem)) {
+ __destroy_snap_realm(mdsc, realm);
+ up_write(&mdsc->snap_rwsem);
+ } else {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_add(&mdsc->snap_empty, &realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero. Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snap_realm *realm;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ while (!list_empty(&mdsc->snap_empty)) {
+ realm = list_first_entry(&mdsc->snap_empty,
+ struct ceph_snap_realm, empty_item);
+ list_del(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ __destroy_snap_realm(mdsc, realm);
+ spin_lock(&mdsc->snap_empty_lock);
+ }
+ spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ down_write(&mdsc->snap_rwsem);
+ __cleanup_empty_realms(mdsc);
+ up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm. adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ u64 parentino)
+{
+ struct ceph_snap_realm *parent;
+
+ if (realm->parent_ino == parentino)
+ return 0;
+
+ parent = ceph_lookup_snap_realm(mdsc, parentino);
+ if (!parent) {
+ parent = ceph_create_snap_realm(mdsc, parentino);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+ }
+ dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+ realm->ino, realm, realm->parent_ino, realm->parent,
+ parentino, parent);
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ ceph_put_snap_realm(mdsc, realm->parent);
+ }
+ realm->parent_ino = parentino;
+ realm->parent = parent;
+ ceph_get_snap_realm(mdsc, parent);
+ list_add(&realm->child_item, &parent->children);
+ return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+ if (*(u64 *)a < *(u64 *)b)
+ return 1;
+ if (*(u64 *)a > *(u64 *)b)
+ return -1;
+ return 0;
+}
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *parent = realm->parent;
+ struct ceph_snap_context *snapc;
+ int err = 0;
+ int i;
+ int num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+ /*
+ * build parent context, if it hasn't been built.
+ * conservatively estimate that all parent snaps might be
+ * included by us.
+ */
+ if (parent) {
+ if (!parent->cached_context) {
+ err = build_snap_context(parent);
+ if (err)
+ goto fail;
+ }
+ num += parent->cached_context->num_snaps;
+ }
+
+ /* do i actually need to update? not if my context seq
+ matches realm seq, and my parents' does to. (this works
+ because we rebuild_snap_realms() works _downward_ in
+ hierarchy after each update.) */
+ if (realm->cached_context &&
+ realm->cached_context->seq <= realm->seq &&
+ (!parent ||
+ realm->cached_context->seq <= parent->cached_context->seq)) {
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+ " (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ realm->cached_context->num_snaps);
+ return 0;
+ }
+
+ /* alloc new snap context */
+ err = -ENOMEM;
+ if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+ goto fail;
+ snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+ if (!snapc)
+ goto fail;
+ atomic_set(&snapc->nref, 1);
+
+ /* build (reverse sorted) snap vector */
+ num = 0;
+ snapc->seq = realm->seq;
+ if (parent) {
+ /* include any of parent's snaps occuring _after_ my
+ parent became my parent */
+ for (i = 0; i < parent->cached_context->num_snaps; i++)
+ if (parent->cached_context->snaps[i] >=
+ realm->parent_since)
+ snapc->snaps[num++] =
+ parent->cached_context->snaps[i];
+ if (parent->cached_context->seq > snapc->seq)
+ snapc->seq = parent->cached_context->seq;
+ }
+ memcpy(snapc->snaps + num, realm->snaps,
+ sizeof(u64)*realm->num_snaps);
+ num += realm->num_snaps;
+ memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+ sizeof(u64)*realm->num_prior_parent_snaps);
+ num += realm->num_prior_parent_snaps;
+
+ sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+ snapc->num_snaps = num;
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
+ realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+
+ if (realm->cached_context)
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = snapc;
+ return 0;
+
+fail:
+ /*
+ * if we fail, clear old (incorrect) cached_context... hopefully
+ * we'll have better luck building it later
+ */
+ if (realm->cached_context) {
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = NULL;
+ }
+ pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+ realm, err);
+ return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *child;
+
+ dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+ build_snap_context(realm);
+
+ list_for_each_entry(child, &realm->children, child_item)
+ rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids. free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, int num)
+{
+ int i;
+
+ kfree(*dst);
+ if (num) {
+ *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+ if (!*dst)
+ return -ENOMEM;
+ for (i = 0; i < num; i++)
+ (*dst)[i] = get_unaligned_le64(src + i);
+ } else {
+ *dst = NULL;
+ }
+ return 0;
+}
+
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known). In this case the
+ * cap_snap->writing = 1, and is said to be "pending." When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap_snap *capsnap;
+ int used;
+
+ capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+ return;
+ }
+
+ spin_lock(&inode->i_lock);
+ used = __ceph_caps_used(ci);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ /* there is no point in queuing multiple "pending" cap_snaps,
+ as no new writes are allowed to start when pending, so any
+ writes in progress now were started before the previous
+ cap_snap. lucky us. */
+ dout("queue_cap_snap %p snapc %p seq %llu used %d"
+ " already pending\n", inode, snapc, snapc->seq, used);
+ kfree(capsnap);
+ } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+ igrab(inode);
+
+ atomic_set(&capsnap->nref, 1);
+ capsnap->ci = ci;
+ INIT_LIST_HEAD(&capsnap->ci_item);
+ INIT_LIST_HEAD(&capsnap->flushing_item);
+
+ capsnap->follows = snapc->seq - 1;
+ capsnap->context = ceph_get_snap_context(snapc);
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = __ceph_caps_dirty(ci);
+
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
+
+ /* fixme? */
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_len = 0;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent writes page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+ if (used & CEPH_CAP_FILE_WR) {
+ dout("queue_cap_snap %p cap_snap %p snapc %p"
+ " seq %llu used WR, now pending\n", inode,
+ capsnap, snapc, snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ } else {
+ dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ kfree(capsnap);
+ }
+
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+ BUG_ON(capsnap->writing);
+ capsnap->size = inode->i_size;
+ capsnap->mtime = inode->i_mtime;
+ capsnap->atime = inode->i_atime;
+ capsnap->ctime = inode->i_ctime;
+ capsnap->time_warp_seq = ci->i_time_warp_seq;
+ if (capsnap->dirty_pages) {
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+ "still has %d dirty pages\n", inode, capsnap,
+ capsnap->context, capsnap->context->seq,
+ capsnap->size, capsnap->dirty_pages);
+ return 0;
+ }
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+ inode, capsnap, capsnap->context,
+ capsnap->context->seq, capsnap->size);
+
+ spin_lock(&mdsc->snap_flush_lock);
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ spin_unlock(&mdsc->snap_flush_lock);
+ return 1; /* caller may want to ceph_flush_snaps */
+}
+
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS. This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+ void *p, void *e, bool deletion)
+{
+ struct ceph_mds_snap_realm *ri; /* encoded */
+ __le64 *snaps; /* encoded */
+ __le64 *prior_parent_snaps; /* encoded */
+ struct ceph_snap_realm *realm;
+ int invalidate = 0;
+ int err = -ENOMEM;
+
+ dout("update_snap_trace deletion=%d\n", deletion);
+more:
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ ri = p;
+ p += sizeof(*ri);
+ ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+ le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+ snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+ prior_parent_snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+ realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (IS_ERR(realm)) {
+ err = PTR_ERR(realm);
+ goto fail;
+ }
+ }
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ /*
+ * if the realm seq has changed, queue a cap_snap for every
+ * inode with open caps. we do this _before_ we update
+ * the realm info so that we prepare for writeback under the
+ * _previous_ snap context.
+ *
+ * ...unless it's a snap deletion!
+ */
+ if (!deletion) {
+ struct ceph_inode_info *ci;
+ struct inode *lastinode = NULL;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_for_each_entry(ci, &realm->inodes_with_caps,
+ i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ lastinode = inode;
+ ceph_queue_cap_snap(ci, realm->cached_context);
+ spin_lock(&realm->inodes_with_caps_lock);
+ }
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ dout("update_snap_trace cap_snaps queued\n");
+ }
+
+ } else {
+ dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ realm->ino, realm, realm->seq);
+ }
+
+ /* ensure the parent is correct */
+ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+ if (err < 0)
+ goto fail;
+ invalidate += err;
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ /* update realm parameters, snap lists */
+ realm->seq = le64_to_cpu(ri->seq);
+ realm->created = le64_to_cpu(ri->created);
+ realm->parent_since = le64_to_cpu(ri->parent_since);
+
+ realm->num_snaps = le32_to_cpu(ri->num_snaps);
+ err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+ if (err < 0)
+ goto fail;
+
+ realm->num_prior_parent_snaps =
+ le32_to_cpu(ri->num_prior_parent_snaps);
+ err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+ realm->num_prior_parent_snaps);
+ if (err < 0)
+ goto fail;
+
+ invalidate = 1;
+ } else if (!realm->cached_context) {
+ invalidate = 1;
+ }
+
+ dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+ realm, invalidate, p, e);
+
+ if (p < e)
+ goto more;
+
+ /* invalidate when we reach the _end_ (root) of the trace */
+ if (invalidate)
+ rebuild_snap_realms(realm);
+
+ __cleanup_empty_realms(mdsc);
+ return 0;
+
+bad:
+ err = -EINVAL;
+fail:
+ pr_err("update_snap_trace error %d\n", err);
+ return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush. Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+ struct ceph_mds_session *session = NULL;
+
+ dout("flush_snaps\n");
+ spin_lock(&mdsc->snap_flush_lock);
+ while (!list_empty(&mdsc->snap_flush_list)) {
+ ci = list_first_entry(&mdsc->snap_flush_list,
+ struct ceph_inode_info, i_snap_flush_item);
+ inode = &ci->vfs_inode;
+ igrab(inode);
+ spin_unlock(&mdsc->snap_flush_lock);
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, &session);
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ spin_lock(&mdsc->snap_flush_lock);
+ }
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm. This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->client->sb;
+ struct ceph_mds_session *session;
+ int mds;
+ u64 split;
+ int op;
+ int trace_len;
+ struct ceph_snap_realm *realm = NULL;
+ void *p = msg->front.iov_base;
+ void *e = p + msg->front.iov_len;
+ struct ceph_mds_snap_head *h;
+ int num_split_inos, num_split_realms;
+ __le64 *split_inos = NULL, *split_realms = NULL;
+ int i;
+ int locked_rwsem = 0;
+
+ if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+ return;
+ mds = le64_to_cpu(msg->hdr.src.name.num);
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = p;
+ op = le32_to_cpu(h->op);
+ split = le64_to_cpu(h->split); /* non-zero if we are splitting an
+ * existing realm */
+ num_split_inos = le32_to_cpu(h->num_split_inos);
+ num_split_realms = le32_to_cpu(h->num_split_realms);
+ trace_len = le32_to_cpu(h->trace_len);
+ p += sizeof(*h);
+
+ dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
+
+ /* find session */
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (!session) {
+ dout("WTF, got snap but no session for mds%d\n", mds);
+ return;
+ }
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ mutex_unlock(&session->s_mutex);
+
+ down_write(&mdsc->snap_rwsem);
+ locked_rwsem = 1;
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ struct ceph_mds_snap_realm *ri;
+
+ /*
+ * A "split" breaks part of an existing realm off into
+ * a new realm. The MDS provides a list of inodes
+ * (with caps) and child realms that belong to the new
+ * child.
+ */
+ split_inos = p;
+ p += sizeof(u64) * num_split_inos;
+ split_realms = p;
+ p += sizeof(u64) * num_split_realms;
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ /* we will peek at realm info here, but will _not_
+ * advance p, as the realm update will occur below in
+ * ceph_update_snap_trace. */
+ ri = p;
+
+ realm = ceph_lookup_snap_realm(mdsc, split);
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, split);
+ if (IS_ERR(realm))
+ goto out;
+ }
+ ceph_get_snap_realm(mdsc, realm);
+
+ dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);
+ struct ceph_inode_info *ci;
+
+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto skip_inode;
+ /*
+ * If this inode belongs to a realm that was
+ * created after our new realm, we experienced
+ * a race (due to another split notifications
+ * arriving from a different MDS). So skip
+ * this inode.
+ */
+ if (ci->i_snap_realm->created >
+ le64_to_cpu(ri->created)) {
+ dout(" leaving %p in newer realm %llx %p\n",
+ inode, ci->i_snap_realm->ino,
+ ci->i_snap_realm);
+ goto skip_inode;
+ }
+ dout(" will move %p to split realm %llx %p\n",
+ inode, realm->ino, realm);
+ /*
+ * Remove the inode from the realm's inode
+ * list, but don't add it to the new realm
+ * yet. We don't want the cap_snap to be
+ * queued (again) by ceph_update_snap_trace()
+ * below. Queue it _now_, under the old context.
+ */
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&inode->i_lock);
+
+ ceph_queue_cap_snap(ci,
+ ci->i_snap_realm->cached_context);
+
+ iput(inode);
+ continue;
+
+skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we may have taken some of the old realm's children. */
+ for (i = 0; i < num_split_realms; i++) {
+ struct ceph_snap_realm *child =
+ ceph_lookup_snap_realm(mdsc,
+ le64_to_cpu(split_realms[i]));
+ if (!child)
+ continue;
+ adjust_snap_realm_parent(mdsc, child, realm->ino);
+ }
+ }
+
+ /*
+ * update using the provided snap trace. if we are deleting a
+ * snap, we can avoid queueing cap_snaps.
+ */
+ ceph_update_snap_trace(mdsc, p, e,
+ op == CEPH_SNAP_OP_DESTROY);
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ /*
+ * ok, _now_ add the inodes into the new realm.
+ */
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);
+ struct ceph_inode_info *ci;
+
+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto split_skip_inode;
+ ceph_put_snap_realm(mdsc, ci->i_snap_realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_get_snap_realm(mdsc, realm);
+split_skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we took a reference when we created the realm, above */
+ ceph_put_snap_realm(mdsc, realm);
+ }
+
+ __cleanup_empty_realms(mdsc);
+
+ up_write(&mdsc->snap_rwsem);
+
+ flush_snaps(mdsc);
+ return;
+
+bad:
+ pr_err("corrupt snap message from mds%d\n", mds);
+out:
+ if (locked_rwsem)
+ up_write(&mdsc->snap_rwsem);
+ return;
+}
+
+
+
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..a828943296c5
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,984 @@
+
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
+#include "decode.h"
+#include "super.h"
+#include "mon_client.h"
+#include "auth.h"
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+ const char *e = s + len;
+
+ while (e != s && *(e-1) != '/')
+ e--;
+ return e;
+}
+
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+ struct ceph_client *cl = ceph_client(s);
+
+ dout("put_super\n");
+ ceph_mdsc_close_sessions(&cl->mdsc);
+ return;
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_monmap *monmap = client->monc.monmap;
+ struct ceph_statfs st;
+ u64 fsid;
+ int err;
+
+ dout("statfs\n");
+ err = ceph_monc_do_statfs(&client->monc, &st);
+ if (err < 0)
+ return err;
+
+ /* fill in kstatfs */
+ buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
+
+ /*
+ * express utilization in terms of large blocks to avoid
+ * overflow on 32-bit machines.
+ */
+ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
+ (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+ buf->f_files = le64_to_cpu(st.num_objects);
+ buf->f_ffree = -1;
+ buf->f_namelen = PATH_MAX;
+ buf->f_frsize = PAGE_CACHE_SIZE;
+
+ /* leave fsid little-endian, regardless of host endianness */
+ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+ buf->f_fsid.val[0] = fsid & 0xffffffff;
+ buf->f_fsid.val[1] = fsid >> 32;
+
+ return 0;
+}
+
+
+static int ceph_syncfs(struct super_block *sb, int wait)
+{
+ dout("sync_fs %d\n", wait);
+ ceph_osdc_sync(&ceph_client(sb)->osdc);
+ ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+ dout("sync_fs %d done\n", wait);
+ return 0;
+}
+
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_mount_args *args = client->mount_args;
+
+ if (args->flags & CEPH_OPT_FSID)
+ seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+ if (args->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, ",noshare");
+ if (args->flags & CEPH_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((args->flags & CEPH_OPT_RBYTES) == 0)
+ seq_puts(m, ",norbytes");
+ if (args->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, ",nocrc");
+ if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+ if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+ if (args->name)
+ seq_printf(m, ",name=%s", args->name);
+ if (args->secret)
+ seq_puts(m, ",secret=<hidden>");
+ return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ ceph_inode_init_once);
+ if (ceph_inode_cachep == NULL)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_cachep == NULL)
+ goto bad_cap;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_dentry_cachep == NULL)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_file_cachep == NULL)
+ goto bad_file;
+
+ return 0;
+
+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);
+ return -ENOMEM;
+}
+
+static void destroy_caches(void)
+{
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount. Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+ struct ceph_client *client = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!client)
+ return;
+ client->mount_state = CEPH_MOUNT_SHUTDOWN;
+ return;
+}
+
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .sync_fs = ceph_syncfs,
+ .put_super = ceph_put_super,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+ default: return "unknown";
+ }
+}
+
+
+/*
+ * mount options
+ */
+enum {
+ Opt_fsidmajor,
+ Opt_fsidminor,
+ Opt_monport,
+ Opt_wsize,
+ Opt_rsize,
+ Opt_osdtimeout,
+ Opt_mount_timeout,
+ Opt_caps_wanted_delay_min,
+ Opt_caps_wanted_delay_max,
+ Opt_readdir_max_entries,
+ Opt_last_int,
+ /* int args above */
+ Opt_snapdirname,
+ Opt_name,
+ Opt_secret,
+ Opt_last_string,
+ /* string args above */
+ Opt_ip,
+ Opt_noshare,
+ Opt_dirstat,
+ Opt_nodirstat,
+ Opt_rbytes,
+ Opt_norbytes,
+ Opt_nocrc,
+ Opt_noasyncreaddir,
+};
+
+static match_table_t arg_tokens = {
+ {Opt_fsidmajor, "fsidmajor=%ld"},
+ {Opt_fsidminor, "fsidminor=%ld"},
+ {Opt_monport, "monport=%d"},
+ {Opt_wsize, "wsize=%d"},
+ {Opt_rsize, "rsize=%d"},
+ {Opt_osdtimeout, "osdtimeout=%d"},
+ {Opt_mount_timeout, "mount_timeout=%d"},
+ {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+ {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ /* int args above */
+ {Opt_snapdirname, "snapdirname=%s"},
+ {Opt_name, "name=%s"},
+ {Opt_secret, "secret=%s"},
+ /* string args above */
+ {Opt_ip, "ip=%s"},
+ {Opt_noshare, "noshare"},
+ {Opt_dirstat, "dirstat"},
+ {Opt_nodirstat, "nodirstat"},
+ {Opt_rbytes, "rbytes"},
+ {Opt_norbytes, "norbytes"},
+ {Opt_nocrc, "nocrc"},
+ {Opt_noasyncreaddir, "noasyncreaddir"},
+ {-1, NULL}
+};
+
+
+static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+ const char *dev_name,
+ const char **path)
+{
+ struct ceph_mount_args *args;
+ const char *c;
+ int err = -ENOMEM;
+ substring_t argstr[MAX_OPT_ARGS];
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return ERR_PTR(-ENOMEM);
+ args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+ GFP_KERNEL);
+ if (!args->mon_addr)
+ goto out;
+
+ dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
+
+ /* start with defaults */
+ args->sb_flags = flags;
+ args->flags = CEPH_OPT_DEFAULT;
+ args->osd_timeout = 5; /* seconds */
+ args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+ args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+ args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
+ args->max_readdir = 1024;
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
+
+ /* get mon ip(s) */
+ err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+ CEPH_MAX_MON, &args->num_mon);
+ if (err < 0)
+ goto out;
+
+ /* path on server */
+ *path += 2;
+ dout("server path '%s'\n", *path);
+
+ /* parse mount options */
+ while ((c = strsep(&options, ",")) != NULL) {
+ int token, intval, ret;
+ if (!*c)
+ continue;
+ err = -EINVAL;
+ token = match_token((char *)c, arg_tokens, argstr);
+ if (token < 0) {
+ pr_err("bad mount option at '%s'\n", c);
+ goto out;
+ }
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ continue;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+ switch (token) {
+ case Opt_fsidmajor:
+ *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
+ break;
+ case Opt_fsidminor:
+ *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
+ break;
+ case Opt_ip:
+ err = ceph_parse_ips(argstr[0].from,
+ argstr[0].to,
+ &args->my_addr,
+ 1, NULL);
+ if (err < 0)
+ goto out;
+ args->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_snapdirname:
+ kfree(args->snapdir_name);
+ args->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_name:
+ args->name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_secret:
+ args->secret = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+
+ /* misc */
+ case Opt_wsize:
+ args->wsize = intval;
+ break;
+ case Opt_rsize:
+ args->rsize = intval;
+ break;
+ case Opt_osdtimeout:
+ args->osd_timeout = intval;
+ break;
+ case Opt_mount_timeout:
+ args->mount_timeout = intval;
+ break;
+ case Opt_caps_wanted_delay_min:
+ args->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ args->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ args->max_readdir = intval;
+ break;
+
+ case Opt_noshare:
+ args->flags |= CEPH_OPT_NOSHARE;
+ break;
+
+ case Opt_dirstat:
+ args->flags |= CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ args->flags &= ~CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ args->flags |= CEPH_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ args->flags &= ~CEPH_OPT_RBYTES;
+ break;
+ case Opt_nocrc:
+ args->flags |= CEPH_OPT_NOCRC;
+ break;
+ case Opt_noasyncreaddir:
+ args->flags |= CEPH_OPT_NOASYNCREADDIR;
+ break;
+
+ default:
+ BUG_ON(token);
+ }
+ }
+ return args;
+
+out:
+ kfree(args->mon_addr);
+ kfree(args);
+ return ERR_PTR(err);
+}
+
+static void destroy_mount_args(struct ceph_mount_args *args)
+{
+ dout("destroy_mount_args %p\n", args);
+ kfree(args->snapdir_name);
+ args->snapdir_name = NULL;
+ kfree(args->name);
+ args->name = NULL;
+ kfree(args->secret);
+ args->secret = NULL;
+ kfree(args);
+}
+
+/*
+ * create a fresh client instance
+ */
+static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+{
+ struct ceph_client *client;
+ int err = -ENOMEM;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&client->mount_mutex);
+
+ init_waitqueue_head(&client->mount_wq);
+
+ client->sb = NULL;
+ client->mount_state = CEPH_MOUNT_MOUNTING;
+ client->mount_args = args;
+
+ client->msgr = NULL;
+
+ client->mount_err = 0;
+
+ err = bdi_init(&client->backing_dev_info);
+ if (err < 0)
+ goto fail;
+
+ err = -ENOMEM;
+ client->wb_wq = create_workqueue("ceph-writeback");
+ if (client->wb_wq == NULL)
+ goto fail_bdi;
+ client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ if (client->pg_inv_wq == NULL)
+ goto fail_wb_wq;
+ client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ if (client->trunc_wq == NULL)
+ goto fail_pg_inv_wq;
+
+ /* set up mempools */
+ err = -ENOMEM;
+ client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+ if (!client->wb_pagevec_pool)
+ goto fail_trunc_wq;
+
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail_mempool;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+ err = ceph_mdsc_init(&client->mdsc, client);
+ if (err < 0)
+ goto fail_osdc;
+ return client;
+
+fail_osdc:
+ ceph_osdc_stop(&client->osdc);
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail_mempool:
+ mempool_destroy(client->wb_pagevec_pool);
+fail_trunc_wq:
+ destroy_workqueue(client->trunc_wq);
+fail_pg_inv_wq:
+ destroy_workqueue(client->pg_inv_wq);
+fail_wb_wq:
+ destroy_workqueue(client->wb_wq);
+fail_bdi:
+ bdi_destroy(&client->backing_dev_info);
+fail:
+ kfree(client);
+ return ERR_PTR(err);
+}
+
+static void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ /* unmount */
+ ceph_mdsc_stop(&client->mdsc);
+ ceph_monc_stop(&client->monc);
+ ceph_osdc_stop(&client->osdc);
+
+ ceph_debugfs_client_cleanup(client);
+ destroy_workqueue(client->wb_wq);
+ destroy_workqueue(client->pg_inv_wq);
+ destroy_workqueue(client->trunc_wq);
+
+ if (client->msgr)
+ ceph_messenger_destroy(client->msgr);
+ mempool_destroy(client->wb_pagevec_pool);
+
+ destroy_mount_args(client->mount_args);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ print_hex_dump(KERN_ERR, "this fsid: ",
+ DUMP_PREFIX_NONE, 16, 1,
+ (void *)fsid, 16, 0);
+ print_hex_dump(KERN_ERR, " old fsid: ",
+ DUMP_PREFIX_NONE, 16, 1,
+ (void *)&client->fsid, 16, 0);
+ pr_err("fsid mismatch\n");
+ return -1;
+ }
+ } else {
+ pr_info("client%lld fsid " FSID_FORMAT "\n",
+ client->monc.auth->global_id, PR_FSID(fsid));
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ ceph_debugfs_client_init(client);
+ client->have_fsid = true;
+ }
+ return 0;
+}
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_map(struct ceph_client *client)
+{
+ return client->monc.monmap && client->monc.monmap->epoch;
+}
+
+/*
+ * Bootstrap mount by opening the root directory. Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_client *client,
+ const char *path,
+ unsigned long started)
+{
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req = NULL;
+ int err;
+ struct dentry *root;
+
+ /* open dir */
+ dout("open_root_inode opening '%s'\n", path);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+ req->r_path1 = kstrdup(path, GFP_NOFS);
+ req->r_ino1.ino = CEPH_INO_ROOT;
+ req->r_ino1.snap = CEPH_NOSNAP;
+ req->r_started = started;
+ req->r_timeout = client->mount_args->mount_timeout * HZ;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == 0) {
+ dout("open_root_inode success\n");
+ if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+ client->sb->s_root == NULL)
+ root = d_alloc_root(req->r_target_inode);
+ else
+ root = d_obtain_alias(req->r_target_inode);
+ req->r_target_inode = NULL;
+ dout("open_root_inode success, root dentry is %p\n", root);
+ } else {
+ root = ERR_PTR(err);
+ }
+ ceph_mdsc_put_request(req);
+ return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+ const char *path)
+{
+ struct ceph_entity_addr *myaddr = NULL;
+ int err;
+ unsigned long timeout = client->mount_args->mount_timeout * HZ;
+ unsigned long started = jiffies; /* note the start time */
+ struct dentry *root;
+
+ dout("mount start\n");
+ mutex_lock(&client->mount_mutex);
+
+ /* initialize the messenger */
+ if (client->msgr == NULL) {
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->mount_args->my_addr;
+ client->msgr = ceph_messenger_create(myaddr);
+ if (IS_ERR(client->msgr)) {
+ err = PTR_ERR(client->msgr);
+ client->msgr = NULL;
+ goto out;
+ }
+ client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+ }
+
+ /* open session, and wait for mon, mds, and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ goto out;
+
+ while (!have_mon_map(client)) {
+ err = -EIO;
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ goto out;
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */
+ have_mon_map(client) || (client->mount_err < 0),
+ timeout);
+ if (err == -EINTR || err == -ERESTARTSYS)
+ goto out;
+ if (client->mount_err < 0) {
+ err = client->mount_err;
+ goto out;
+ }
+ }
+
+ dout("mount opening root\n");
+ root = open_root_dentry(client, "", started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+ if (client->sb->s_root)
+ dput(root);
+ else
+ client->sb->s_root = root;
+
+ if (path[0] == 0) {
+ dget(root);
+ } else {
+ dout("mount opening base mountpoint\n");
+ root = open_root_dentry(client, path, started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ dput(client->sb->s_root);
+ client->sb->s_root = NULL;
+ goto out;
+ }
+ }
+
+ mnt->mnt_root = root;
+ mnt->mnt_sb = client->sb;
+
+ client->mount_state = CEPH_MOUNT_MOUNTED;
+ dout("mount success\n");
+ err = 0;
+
+out:
+ mutex_unlock(&client->mount_mutex);
+ return err;
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+ struct ceph_client *client = data;
+ int ret;
+
+ dout("set_super %p data %p\n", s, data);
+
+ s->s_flags = client->mount_args->sb_flags;
+ s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+
+ s->s_fs_info = client;
+ client->sb = s;
+
+ s->s_op = &ceph_super_ops;
+ s->s_export_op = &ceph_export_ops;
+
+ s->s_time_gran = 1000; /* 1000 ns == 1 us */
+
+ ret = set_anon_super(s, NULL); /* what is that second arg for? */
+ if (ret != 0)
+ goto fail;
+
+ return ret;
+
+fail:
+ s->s_fs_info = NULL;
+ client->sb = NULL;
+ return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+ struct ceph_client *new = data;
+ struct ceph_mount_args *args = new->mount_args;
+ struct ceph_client *other = ceph_sb_to_client(sb);
+ int i;
+
+ dout("ceph_compare_super %p\n", sb);
+ if (args->flags & CEPH_OPT_FSID) {
+ if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ } else {
+ /* do we share (a) monitor? */
+ for (i = 0; i < new->monc.monmap->num_mon; i++)
+ if (ceph_monmap_contains(other->monc.monmap,
+ &new->monc.monmap->mon_inst[i].addr))
+ break;
+ if (i == new->monc.monmap->num_mon) {
+ dout("mon ip not part of monmap\n");
+ return 0;
+ }
+ dout("mon ip matches existing sb %p\n", sb);
+ }
+ if (args->sb_flags != other->mount_args->sb_flags) {
+ dout("flags differ\n");
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+{
+ int err;
+
+ sb->s_bdi = &client->backing_dev_info;
+
+ /* set ra_pages based on rsize mount option? */
+ if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
+ client->backing_dev_info.ra_pages =
+ (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_SHIFT;
+ err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+ return err;
+}
+
+static int ceph_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data,
+ struct vfsmount *mnt)
+{
+ struct super_block *sb;
+ struct ceph_client *client;
+ int err;
+ int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+ const char *path = NULL;
+ struct ceph_mount_args *args;
+
+ dout("ceph_get_sb\n");
+ args = parse_mount_args(flags, data, dev_name, &path);
+ if (IS_ERR(args)) {
+ err = PTR_ERR(args);
+ goto out_final;
+ }
+
+ /* create client (which we may/may not use) */
+ client = ceph_create_client(args);
+ if (IS_ERR(client)) {
+ err = PTR_ERR(client);
+ goto out_final;
+ }
+
+ if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+ compare_super = NULL;
+ sb = sget(fs_type, compare_super, ceph_set_super, client);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ goto out;
+ }
+
+ if (ceph_client(sb) != client) {
+ ceph_destroy_client(client);
+ client = ceph_client(sb);
+ dout("get_sb got existing client %p\n", client);
+ } else {
+ dout("get_sb using new client %p\n", client);
+ err = ceph_register_bdi(sb, client);
+ if (err < 0)
+ goto out_splat;
+ }
+
+ err = ceph_mount(client, mnt, path);
+ if (err < 0)
+ goto out_splat;
+ dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+ mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+ return 0;
+
+out_splat:
+ ceph_mdsc_close_sessions(&client->mdsc);
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ goto out_final;
+
+out:
+ ceph_destroy_client(client);
+out_final:
+ dout("ceph_get_sb fail %d\n", err);
+ return err;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+ struct ceph_client *client = ceph_sb_to_client(s);
+ dout("kill_sb %p\n", s);
+ ceph_mdsc_pre_umount(&client->mdsc);
+ kill_anon_super(s); /* will call put_super after sb is r/o */
+ bdi_unregister(&client->backing_dev_info);
+ bdi_destroy(&client->backing_dev_info);
+ ceph_destroy_client(client);
+}
+
+static struct file_system_type ceph_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ceph",
+ .get_sb = ceph_get_sb,
+ .kill_sb = ceph_kill_sb,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
+};
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+static int __init init_ceph(void)
+{
+ int ret = 0;
+
+ ret = ceph_debugfs_init();
+ if (ret < 0)
+ goto out;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ ret = init_caches();
+ if (ret)
+ goto out_msgr;
+
+ ceph_caps_init();
+
+ ret = register_filesystem(&ceph_fs_type);
+ if (ret)
+ goto out_icache;
+
+ pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
+ CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
+ CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+ return 0;
+
+out_icache:
+ destroy_caches();
+out_msgr:
+ ceph_msgr_exit();
+out_debugfs:
+ ceph_debugfs_cleanup();
+out:
+ return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+ dout("exit_ceph\n");
+ unregister_filesystem(&ceph_fs_type);
+ ceph_caps_finalize();
+ destroy_caches();
+ ceph_msgr_exit();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..de5e32414978
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,895 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "mds_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID (1<<0)
+#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
+#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
+#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
+#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+
+#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
+
+#define ceph_set_opt(client, opt) \
+ (client)->mount_args->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+ (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+
+
+struct ceph_mount_args {
+ int sb_flags;
+ int num_mon;
+ struct ceph_entity_addr *mon_addr;
+ int flags;
+ int mount_timeout;
+ int caps_wanted_delay_min, caps_wanted_delay_max;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
+ int wsize;
+ int rsize; /* max readahead */
+ int max_readdir; /* max readdir size */
+ int osd_timeout;
+ char *snapdir_name; /* default ".snap" */
+ char *name;
+ char *secret;
+ int cap_release_safety;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
+#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
+
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
+
+/* mount state */
+enum {
+ CEPH_MOUNT_MOUNTING,
+ CEPH_MOUNT_MOUNTED,
+ CEPH_MOUNT_UNMOUNTING,
+ CEPH_MOUNT_UNMOUNTED,
+ CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+ BUG_ON(time_after(b, a));
+ return (long)a - (long)b;
+}
+
+/*
+ * per-filesystem client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+ struct ceph_fsid fsid;
+ bool have_fsid;
+
+ struct mutex mount_mutex; /* serialize mount attempts */
+ struct ceph_mount_args *mount_args;
+
+ struct super_block *sb;
+
+ unsigned long mount_state;
+ wait_queue_head_t mount_wq;
+
+ int mount_err;
+
+ struct ceph_messenger *msgr; /* messenger instance */
+ struct ceph_mon_client monc;
+ struct ceph_mds_client mdsc;
+ struct ceph_osd_client osdc;
+
+ /* writeback */
+ mempool_t *wb_pagevec_pool;
+ struct workqueue_struct *wb_wq;
+ struct workqueue_struct *pg_inv_wq;
+ struct workqueue_struct *trunc_wq;
+
+ struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_monmap;
+ struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+ struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+#endif
+};
+
+static inline struct ceph_client *ceph_client(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+
+/*
+ * File i/o capability. This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data. For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+ struct ceph_inode_info *ci;
+ struct rb_node ci_node; /* per-ci cap tree */
+ struct ceph_mds_session *session;
+ struct list_head session_caps; /* per-session caplist */
+ int mds;
+ u64 cap_id; /* unique cap id (mds provided) */
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of issued (for revocation) */
+ int mds_wanted;
+ u32 seq, issue_seq, mseq;
+ u32 cap_gen; /* active/stale cycle */
+ unsigned long last_used;
+ struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds. When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+ atomic_t nref;
+ struct ceph_inode_info *ci;
+ struct list_head ci_item, flushing_item;
+
+ u64 follows, flush_tid;
+ int issued, dirty;
+ struct ceph_snap_context *context;
+
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+
+ void *xattr_blob;
+ int xattr_len;
+ u64 xattr_version;
+
+ u64 size;
+ struct timespec mtime, atime, ctime;
+ u64 time_warp_seq;
+ int writing; /* a sync write is still in progress */
+ int dirty_pages; /* dirty pages awaiting writeback */
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (atomic_dec_and_test(&capsnap->nref))
+ kfree(capsnap);
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers. It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info. That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+ struct rb_node node;
+
+ /* fragtree state */
+ u32 frag;
+ int split_by; /* i.e. 2^(split_by) children */
+
+ /* delegation and replication info */
+ int mds; /* -1 if same authority as parent */
+ int ndist; /* >0 if replicated */
+ int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+ struct rb_node node;
+
+ const char *name;
+ int name_len;
+ const char *val;
+ int val_len;
+ int dirty;
+
+ int should_free_name;
+ int should_free_val;
+};
+
+struct ceph_inode_xattrs_info {
+ /*
+ * (still encoded) xattr blob. we avoid the overhead of parsing
+ * this until someone actually calls getxattr, etc.
+ *
+ * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+ * NULL means we don't know.
+ */
+ struct ceph_buffer *blob, *prealloc_blob;
+
+ struct rb_root index;
+ bool dirty;
+ int count;
+ int names_size;
+ int vals_size;
+ u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+
+struct ceph_inode_info {
+ struct ceph_vino i_vino; /* ceph ino + snap */
+
+ u64 i_version;
+ u32 i_time_warp_seq;
+
+ unsigned i_ceph_flags;
+ unsigned long i_release_count;
+
+ struct ceph_file_layout i_layout;
+ char *i_symlink;
+
+ /* for dirs */
+ struct timespec i_rctime;
+ u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_files, i_subdirs;
+ u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
+
+ struct rb_root i_fragtree;
+ struct mutex i_fragtree_mutex;
+
+ struct ceph_inode_xattrs_info i_xattrs;
+
+ /* capabilities. protected _both_ by i_lock and cap->session's
+ * s_mutex. */
+ struct rb_root i_caps; /* cap list */
+ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
+ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
+ struct list_head i_dirty_item, i_flushing_item;
+ u64 i_cap_flush_seq;
+ /* we need to track cap writeback on a per-cap-bit basis, to allow
+ * overlapping, pipelined cap flushes to the mds. we can probably
+ * reduce the tid to 8 bits if we're concerned about inode size. */
+ u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+ wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
+ unsigned long i_hold_caps_min; /* jiffies */
+ unsigned long i_hold_caps_max; /* jiffies */
+ struct list_head i_cap_delay_list; /* for delayed cap release to mds */
+ int i_cap_exporting_mds; /* to handle cap migration between */
+ unsigned i_cap_exporting_mseq; /* mds's. */
+ unsigned i_cap_exporting_issued;
+ struct ceph_cap_reservation i_cap_migration_resv;
+ struct list_head i_cap_snaps; /* snapped state pending flush to mds */
+ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
+ unsigned i_snap_caps; /* cap bits for snapped files */
+
+ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+
+ u32 i_truncate_seq; /* last truncate to smaller size */
+ u64 i_truncate_size; /* and the size we last truncated down to */
+ int i_truncate_pending; /* still need to call vmtruncate */
+
+ u64 i_max_size; /* max file size authorized by mds */
+ u64 i_reported_size; /* (max_)size reported to or requested of mds */
+ u64 i_wanted_max_size; /* offset we'd like to write too */
+ u64 i_requested_max_size; /* max_size we've requested */
+
+ /* held references to caps */
+ int i_pin_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref;
+ int i_wrbuffer_ref, i_wrbuffer_ref_head;
+ u32 i_shared_gen; /* increment each time we get FILE_SHARED */
+ u32 i_rdcache_gen; /* we increment this each time we get
+ FILE_CACHE. If it's non-zero, we
+ _may_ have cached pages. */
+ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+ struct list_head i_unsafe_writes; /* uncommitted sync writes */
+ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ spinlock_t i_unsafe_lock;
+
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ int i_snap_realm_counter; /* snap realm (if caps) */
+ struct list_head i_snap_realm_item;
+ struct list_head i_snap_flush_item;
+
+ struct work_struct i_wb_work; /* writeback work */
+ struct work_struct i_pg_inv_work; /* page invalidation work */
+
+ struct work_struct i_vmtruncate_work;
+
+ struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+ return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags &= ~mask;
+ spin_unlock(&inode->i_lock);
+}
+
+static inline void ceph_i_set(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags |= mask;
+ spin_unlock(&inode->i_lock);
+}
+
+static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool r;
+
+ smp_mb();
+ r = (ci->i_ceph_flags & mask) == mask;
+ return r;
+}
+
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+ u32 f);
+
+/*
+ * choose fragment for value @v. copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found);
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ u32 lease_gen, lease_shared_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ u64 time;
+ u64 offset;
+};
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+ return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+ return ((loff_t)frag << 32) | (loff_t)off;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+ ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+ ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+ if (!ino)
+ ino = 1;
+#endif
+ return ino;
+}
+
+static inline int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+ ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ return 0;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+ struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+ int issued;
+ spin_lock(&ci->vfs_inode.i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ int r;
+ spin_lock(&ci->vfs_inode.i_lock);
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+ return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
+ return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(void);
+extern void ceph_caps_finalize(void);
+extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used,
+ int *reserved);
+
+static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+{
+ return (struct ceph_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+{
+ return (struct ceph_client *)sb->s_fs_info;
+}
+
+static inline int ceph_queue_writeback(struct inode *inode)
+{
+ return queue_work(ceph_inode_to_client(inode)->wb_wq,
+ &ceph_inode(inode)->i_wb_work);
+}
+
+static inline int ceph_queue_page_invalidation(struct inode *inode)
+{
+ return queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+ &ceph_inode(inode)->i_pg_inv_work);
+}
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+struct ceph_file_info {
+ int fmode; /* initialized on open */
+
+ /* readdir: position within the dir */
+ u32 frag;
+ struct ceph_mds_request *last_readdir;
+ int at_end;
+
+ /* readdir: position within a frag */
+ unsigned offset; /* offset of last chunk, adjusted for . and .. */
+ u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ char *last_name; /* last entry in previous chunk */
+ struct dentry *dentry; /* next dentry (for dcache readdir) */
+ unsigned long dir_release_count;
+
+ /* used for -o dirstat read() on directory thing */
+ char *dir_info;
+ int dir_info_len;
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data. It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+ atomic_t nref;
+ u64 seq;
+ int num_snaps;
+ u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+ /*
+ printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)+1);
+ */
+ if (sc)
+ atomic_inc(&sc->nref);
+ return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+ if (!sc)
+ return;
+ /*
+ printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)-1);
+ */
+ if (atomic_dec_and_test(&sc->nref)) {
+ /*printk(" deleting snap_context %p\n", sc);*/
+ kfree(sc);
+ }
+}
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it. The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+ u64 ino;
+ atomic_t nref;
+ u64 created, seq;
+ u64 parent_ino;
+ u64 parent_since; /* snapid when our current parent became so */
+
+ u64 *prior_parent_snaps; /* snaps inherited from any parents we */
+ int num_prior_parent_snaps; /* had prior to parent_since */
+ u64 *snaps; /* snaps specific to this realm */
+ int num_snaps;
+
+ struct ceph_snap_realm *parent;
+ struct list_head children; /* list of child realms */
+ struct list_head child_item;
+
+ struct list_head empty_item; /* if i have ref==0 */
+
+ /* the current set of snaps for this realm */
+ struct ceph_snap_context *cached_context;
+
+ struct list_head inodes_with_caps;
+ spinlock_t inodes_with_caps_lock;
+};
+
+
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+ return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+ (off >> PAGE_CACHE_SHIFT);
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+ void *p, void *e, bool deletion);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_snap_context *snapc);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+ return !list_empty(&ci->i_cap_snaps) &&
+ list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+ ci_item)->writing;
+}
+
+
+/* super.c */
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+
+#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
+ "%02x%02x%02x%02x%02x%02x"
+#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
+ (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
+ (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
+ (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+ struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec *ctime,
+ struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+ struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void ceph_inode_writeback(struct work_struct *work);
+extern void ceph_vmtruncate_work(struct work_struct *work);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void __ceph_queue_vmtruncate(struct inode *inode);
+
+extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+ size_t, int);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation);
+extern void __ceph_remove_cap(struct ceph_cap *cap,
+ struct ceph_cap_reservation *ctx);
+static inline void ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct inode *inode = &cap->ci->vfs_inode;
+ spin_lock(&inode->i_lock);
+ __ceph_remove_cap(cap, NULL);
+ spin_unlock(&inode->i_lock);
+}
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, int unused);
+extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+ ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+extern const struct address_space_operations ceph_aops;
+extern int ceph_open(struct inode *inode, struct file *file);
+extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd, int mode,
+ int locked_dir);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+ ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.
+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
+{
+ if (dentry && dentry->d_parent)
+ return dentry->d_parent->d_inode;
+
+ return NULL;
+}
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+ u64 ino;
+ u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+ int count;
+};
+
+
+#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..04769a3ab832
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,842 @@
+#include "ceph_debug.h"
+#include "super.h"
+#include "decode.h"
+
+#include <linux/xattr.h>
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+ return !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr_cb {
+ bool readonly;
+ char *name;
+ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+ size_t size);
+};
+
+/* directories */
+
+static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+ (long)ci->i_rctime.tv_nsec);
+}
+
+static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
+ { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
+ { true, "user.ceph.dir.files", ceph_vxattrcb_files},
+ { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+ { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
+ { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+ { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+ { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+ { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+ { true, NULL, NULL }
+};
+
+/* files */
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int ret;
+
+ ret = snprintf(val, size,
+ "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (ceph_file_layout_pg_preferred(ci->i_layout))
+ ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+ (unsigned long long)ceph_file_layout_pg_preferred(
+ ci->i_layout));
+ return ret;
+}
+
+static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+ { true, "user.ceph.layout", ceph_vxattrcb_layout},
+ { NULL, NULL }
+};
+
+static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode))
+ return ceph_dir_vxattrs;
+ else if (S_ISREG(inode->i_mode))
+ return ceph_file_vxattrs;
+ return NULL;
+}
+
+static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+ const char *name)
+{
+ do {
+ if (strcmp(vxattr->name, name) == 0)
+ return vxattr;
+ vxattr++;
+ } while (vxattr->name);
+ return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+ const char *name, int name_len,
+ const char *val, int val_len,
+ int dirty,
+ int should_free_name, int should_free_val,
+ struct ceph_inode_xattr **newxattr)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int c;
+ int new = 0;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ if (name_len == xattr->name_len)
+ break;
+ else if (name_len < xattr->name_len)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ xattr = NULL;
+ }
+
+ if (!xattr) {
+ new = 1;
+ xattr = *newxattr;
+ xattr->name = name;
+ xattr->name_len = name_len;
+ xattr->should_free_name = should_free_name;
+
+ ci->i_xattrs.count++;
+ dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+ } else {
+ kfree(*newxattr);
+ *newxattr = NULL;
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ if (should_free_name) {
+ kfree((void *)name);
+ name = xattr->name;
+ }
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ }
+ if (!xattr) {
+ pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
+ &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
+ xattr->val);
+ return -ENOMEM;
+ }
+ ci->i_xattrs.names_size += name_len;
+ ci->i_xattrs.vals_size += val_len;
+ if (val)
+ xattr->val = val;
+ else
+ xattr->val = "";
+
+ xattr->val_len = val_len;
+ xattr->dirty = dirty;
+ xattr->should_free_val = (val && should_free_val);
+
+ if (new) {
+ rb_link_node(&xattr->node, parent, p);
+ rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+ dout("__set_xattr_val p=%p\n", p);
+ }
+
+ dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+ ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+ return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+ const char *name)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int c;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, xattr->name_len);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ dout("__get_xattr %s: found %.*s\n", name,
+ xattr->val_len, xattr->val);
+ return xattr;
+ }
+ }
+
+ dout("__get_xattr %s: not found\n", name);
+
+ return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+ BUG_ON(!xattr);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr)
+{
+ if (!xattr)
+ return -EOPNOTSUPP;
+
+ rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ ci->i_xattrs.count--;
+ kfree(xattr);
+
+ return 0;
+}
+
+static int __remove_xattr_by_name(struct ceph_inode_info *ci,
+ const char *name)
+{
+ struct rb_node **p;
+ struct ceph_inode_xattr *xattr;
+ int err;
+
+ p = &ci->i_xattrs.index.rb_node;
+ xattr = __get_xattr(ci, name);
+ err = __remove_xattr(ci, xattr);
+ return err;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+ char *dest)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+ dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest[xattr->name_len] = '\0';
+
+ dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
+
+ dest += xattr->name_len + 1;
+ p = rb_next(p);
+ }
+
+ return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+ struct rb_node *p, *tmp;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+
+ dout("__ceph_destroy_xattrs p=%p\n", p);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ tmp = p;
+ p = rb_next(tmp);
+ dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+ xattr->name_len, xattr->name);
+ rb_erase(tmp, &ci->i_xattrs.index);
+
+ __free_xattr(xattr);
+ }
+
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.index_version = 0;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+{
+ u32 namelen;
+ u32 numattr = 0;
+ void *p, *end;
+ u32 len;
+ const char *name, *val;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int xattr_version;
+ struct ceph_inode_xattr **xattrs = NULL;
+ int err = 0;
+ int i;
+
+ dout("__build_xattrs() len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+ if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+ return 0; /* already built */
+
+ __ceph_destroy_xattrs(ci);
+
+start:
+ /* updated internal xattr rb tree */
+ if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+ p = ci->i_xattrs.blob->vec.iov_base;
+ end = p + ci->i_xattrs.blob->vec.iov_len;
+ ceph_decode_32_safe(&p, end, numattr, bad);
+ xattr_version = ci->i_xattrs.version;
+ spin_unlock(&inode->i_lock);
+
+ xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+ GFP_NOFS);
+ err = -ENOMEM;
+ if (!xattrs)
+ goto bad_lock;
+ memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+ for (i = 0; i < numattr; i++) {
+ xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+ GFP_NOFS);
+ if (!xattrs[i])
+ goto bad_lock;
+ }
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_xattrs.version != xattr_version) {
+ /* lost a race, retry */
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ goto start;
+ }
+ err = -EIO;
+ while (numattr--) {
+ ceph_decode_32_safe(&p, end, len, bad);
+ namelen = len;
+ name = p;
+ p += len;
+ ceph_decode_32_safe(&p, end, len, bad);
+ val = p;
+ p += len;
+
+ err = __set_xattr(ci, name, namelen, val, len,
+ 0, 0, 0, &xattrs[numattr]);
+
+ if (err < 0)
+ goto bad;
+ }
+ kfree(xattrs);
+ }
+ ci->i_xattrs.index_version = ci->i_xattrs.version;
+ ci->i_xattrs.dirty = false;
+
+ return err;
+bad_lock:
+ spin_lock(&inode->i_lock);
+bad:
+ if (xattrs) {
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ }
+ ci->i_xattrs.names_size = 0;
+ return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+ int val_size)
+{
+ /*
+ * 4 bytes for the length, and additional 4 bytes per each xattr name,
+ * 4 bytes per each value
+ */
+ int size = 4 + ci->i_xattrs.count*(4 + 4) +
+ ci->i_xattrs.names_size +
+ ci->i_xattrs.vals_size;
+ dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+ ci->i_xattrs.count, ci->i_xattrs.names_size,
+ ci->i_xattrs.vals_size);
+
+ if (name_size)
+ size += 4 + 4 + name_size + val_size;
+
+ return size;
+}
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place.
+ */
+void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+ void *dest;
+
+ dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ if (ci->i_xattrs.dirty) {
+ int need = __get_required_blob_size(ci, 0, 0);
+
+ BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+ p = rb_first(&ci->i_xattrs.index);
+ dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ ceph_encode_32(&dest, ci->i_xattrs.count);
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+ ceph_encode_32(&dest, xattr->name_len);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest += xattr->name_len;
+ ceph_encode_32(&dest, xattr->val_len);
+ memcpy(dest, xattr->val, xattr->val_len);
+ dest += xattr->val_len;
+
+ p = rb_next(p);
+ }
+
+ /* adjust buffer len; it may be larger than we need */
+ ci->i_xattrs.prealloc_blob->vec.iov_len =
+ dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ }
+}
+
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int err;
+ struct ceph_inode_xattr *xattr;
+ struct ceph_vxattr_cb *vxattr = NULL;
+
+ if (!ceph_is_valid_xattr(name))
+ return -ENODATA;
+
+ /* let's see if a virtual xattr was requested */
+ if (vxattrs)
+ vxattr = ceph_match_vxattr(vxattrs, name);
+
+ spin_lock(&inode->i_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+ (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+ goto get_xattr;
+ } else {
+ spin_unlock(&inode->i_lock);
+ /* get xattrs from mds (if we don't already have them) */
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ if (err)
+ return err;
+ }
+
+ spin_lock(&inode->i_lock);
+
+ if (vxattr && vxattr->readonly) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+
+get_xattr:
+ err = -ENODATA; /* == ENOATTR */
+ xattr = __get_xattr(ci, name);
+ if (!xattr) {
+ if (vxattr)
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
+ err = -ERANGE;
+ if (size && size < xattr->val_len)
+ goto out;
+
+ err = xattr->val_len;
+ if (size == 0)
+ goto out;
+
+ memcpy(value, xattr->val, xattr->val_len);
+
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ u32 vir_namelen = 0;
+ u32 namelen;
+ int err;
+ u32 len;
+ int i;
+
+ spin_lock(&inode->i_lock);
+ dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+ (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+ goto list_xattr;
+ } else {
+ spin_unlock(&inode->i_lock);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ if (err)
+ return err;
+ }
+
+ spin_lock(&inode->i_lock);
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+
+list_xattr:
+ vir_namelen = 0;
+ /* include virtual dir xattrs */
+ if (vxattrs)
+ for (i = 0; vxattrs[i].name; i++)
+ vir_namelen += strlen(vxattrs[i].name) + 1;
+ /* adding 1 byte per each variable due to the null termination */
+ namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+ err = -ERANGE;
+ if (size && namelen > size)
+ goto out;
+
+ err = namelen;
+ if (size == 0)
+ goto out;
+
+ names = __copy_xattr_names(ci, names);
+
+ /* virtual xattr names, too */
+ if (vxattrs)
+ for (i = 0; vxattrs[i].name; i++) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ }
+
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+}
+
+static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+ const char *value, size_t size, int flags)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ int err;
+ int i, nr_pages;
+ struct page **pages = NULL;
+ void *kaddr;
+
+ /* copy value into some pages */
+ nr_pages = calc_pages_for(0, size);
+ if (nr_pages) {
+ pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ err = -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i]) {
+ nr_pages = i;
+ goto out;
+ }
+ kaddr = kmap(pages[i]);
+ memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
+ min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
+ }
+ }
+
+ dout("setxattr value=%.*s\n", (int)size, value);
+
+ /* do request */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+ req->r_num_caps = 1;
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+
+ req->r_pages = pages;
+ req->r_num_pages = nr_pages;
+ req->r_data_len = size;
+
+ dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+ }
+ return err;
+}
+
+int ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int err;
+ int name_len = strlen(name);
+ int val_len = size;
+ char *newname = NULL;
+ char *newval = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int issued;
+ int required_blob_size;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (vxattrs) {
+ struct ceph_vxattr_cb *vxattr =
+ ceph_match_vxattr(vxattrs, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+ }
+
+ /* preallocate memory for xattr name, value, index node */
+ err = -ENOMEM;
+ newname = kmalloc(name_len + 1, GFP_NOFS);
+ if (!newname)
+ goto out;
+ memcpy(newname, name, name_len + 1);
+
+ if (val_len) {
+ newval = kmalloc(val_len + 1, GFP_NOFS);
+ if (!newval)
+ goto out;
+ memcpy(newval, value, val_len);
+ newval[val_len] = '\0';
+ }
+
+ xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+ if (!xattr)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+retry:
+ issued = __ceph_caps_issued(ci, NULL);
+ if (!(issued & CEPH_CAP_XATTR_EXCL))
+ goto do_sync;
+ __build_xattrs(inode);
+
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob = NULL;
+
+ spin_unlock(&inode->i_lock);
+ dout(" preaallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new_alloc(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&inode->i_lock);
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
+
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+ err = __set_xattr(ci, newname, name_len, newval,
+ val_len, 1, 1, 1, &xattr);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+ spin_unlock(&inode->i_lock);
+
+ return err;
+
+do_sync:
+ spin_unlock(&inode->i_lock);
+ err = ceph_sync_setxattr(dentry, name, value, size, flags);
+out:
+ kfree(newname);
+ kfree(newval);
+ kfree(xattr);
+ return err;
+}
+
+static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = dentry->d_inode;
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ struct ceph_mds_request *req;
+ int err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+ req->r_num_caps = 1;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int issued;
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (vxattrs) {
+ struct ceph_vxattr_cb *vxattr =
+ ceph_match_vxattr(vxattrs, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock(&inode->i_lock);
+ __build_xattrs(inode);
+ issued = __ceph_caps_issued(ci, NULL);
+ dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (!(issued & CEPH_CAP_XATTR_EXCL))
+ goto do_sync;
+
+ err = __remove_xattr_by_name(ceph_inode(inode), name);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+
+ spin_unlock(&inode->i_lock);
+
+ return err;
+do_sync:
+ spin_unlock(&inode->i_lock);
+ err = ceph_send_removexattr(dentry, name);
+ return err;
+}
+
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
* - no readahead or I/O queue unplugging required
*/
struct backing_dev_info directly_mappable_cdev_bdi = {
+ .name = "char",
.capabilities = (
#ifdef CONFIG_MMU
/* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
}
/**
- * register_chrdev() - Register a major number for character devices.
+ * __register_chrdev() - create and register a cdev occupying a range of minors
* @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
* @name: name of this range of devices
* @fops: file operations associated with this devices
*
@@ -254,19 +257,16 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
* /dev. It only helps to keep track of the different owners of devices. If
* your module name has only one type of devices it's ok to use e.g. the name
* of the module here.
- *
- * This function registers a range of 256 minor numbers. The first minor number
- * is 0.
*/
-int register_chrdev(unsigned int major, const char *name,
- const struct file_operations *fops)
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name,
+ const struct file_operations *fops)
{
struct char_device_struct *cd;
struct cdev *cdev;
- char *s;
int err = -ENOMEM;
- cd = __register_chrdev_region(major, 0, 256, name);
+ cd = __register_chrdev_region(major, baseminor, count, name);
if (IS_ERR(cd))
return PTR_ERR(cd);
@@ -277,10 +277,8 @@ int register_chrdev(unsigned int major, const char *name,
cdev->owner = fops->owner;
cdev->ops = fops;
kobject_set_name(&cdev->kobj, "%s", name);
- for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
- *s = '!';
- err = cdev_add(cdev, MKDEV(cd->major, 0), 256);
+ err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
goto out;
@@ -290,7 +288,7 @@ int register_chrdev(unsigned int major, const char *name,
out:
kobject_put(&cdev->kobj);
out2:
- kfree(__unregister_chrdev_region(cd->major, 0, 256));
+ kfree(__unregister_chrdev_region(cd->major, baseminor, count));
return err;
}
@@ -316,10 +314,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
}
}
-void unregister_chrdev(unsigned int major, const char *name)
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count. This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name)
{
struct char_device_struct *cd;
- cd = __unregister_chrdev_region(major, 0, 256);
+
+ cd = __unregister_chrdev_region(major, baseminor, count);
if (cd && cd->cdev)
cdev_del(cd->cdev);
kfree(cd);
@@ -568,6 +579,6 @@ EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(cdev_index);
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..094ea65afc85 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,9 +1,21 @@
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries works).
+
Version 1.60
-------------
Fix memory leak in reconnect. Fix oops in DFS mount error path.
Set s_maxbytes to smaller (the max that vfs can handle) so that
sendfile will now work over cifs mounts again. Add noforcegid
-and noforceuid mount parameters.
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
Version 1.59
------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 6994a0f54f02..80f352596807 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,7 @@ config CIFS
tristate "CIFS support (advanced network filesystem, SMBFS successor)"
depends on INET
select NLS
+ select SLOW_WORK
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..fea9e898c4ba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
if (rc != 0) {
cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
- __func__, *devname, rc));;
+ __func__, *devname, rc));
goto compose_mount_options_err;
}
/* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -385,7 +385,7 @@ out_err:
goto out;
}
-struct inode_operations cifs_dfs_referral_inode_operations = {
+const struct inode_operations cifs_dfs_referral_inode_operations = {
.follow_link = cifs_dfs_follow_mountpoint,
};
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
if (server->addr.sockAddr.sin_family == AF_INET)
sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
else if (server->addr.sockAddr.sin_family == AF_INET6)
- sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+ sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
else
goto out;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
return get_cifs_acl_by_path(cifs_sb, path, pacllen);
pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return pntsd;
}
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return rc;
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
compare with the NTLM example */
hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+ kfree(pctxt);
return rc;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..9a5e4f5f3122 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
#ifdef CONFIG_CIFS_QUOTA
-static struct quotactl_ops cifs_quotactl_ops;
+static const struct quotactl_ops cifs_quotactl_ops;
#endif /* QUOTA */
int cifsFYI = 0;
@@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0;
unsigned int extended_security = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
-extern struct task_struct *oplockThread; /* remove sparse warning */
-struct task_struct *oplockThread = NULL;
-/* extern struct task_struct * dnotifyThread; remove sparse warning */
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, int, 0);
@@ -185,8 +182,7 @@ out_mount_failed:
cifs_sb->mountdata = NULL;
}
#endif
- if (cifs_sb->local_nls)
- unload_nls(cifs_sb->local_nls);
+ unload_nls(cifs_sb->local_nls);
kfree(cifs_sb);
}
return rc;
@@ -361,13 +357,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
static int
cifs_show_options(struct seq_file *s, struct vfsmount *m)
{
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
-
- cifs_sb = CIFS_SB(m->mnt_sb);
- tcon = cifs_sb->tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
- seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+ seq_printf(s, ",unc=%s", tcon->treeName);
if (tcon->ses->userName)
seq_printf(s, ",username=%s", tcon->ses->userName);
if (tcon->ses->domainName)
@@ -520,7 +513,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
return rc;
}
-static struct quotactl_ops cifs_quotactl_ops = {
+static const struct quotactl_ops cifs_quotactl_ops = {
.set_xquota = cifs_xquota_set,
.get_xquota = cifs_xquota_get,
.set_xstate = cifs_xstate_set,
@@ -976,89 +969,12 @@ cifs_destroy_mids(void)
kmem_cache_destroy(cifs_oplock_cachep);
}
-static int cifs_oplock_thread(void *dummyarg)
-{
- struct oplock_q_entry *oplock_item;
- struct cifsTconInfo *pTcon;
- struct inode *inode;
- __u16 netfid;
- int rc, waitrc = 0;
-
- set_freezable();
- do {
- if (try_to_freeze())
- continue;
-
- spin_lock(&GlobalMid_Lock);
- if (list_empty(&GlobalOplock_Q)) {
- spin_unlock(&GlobalMid_Lock);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(39*HZ);
- } else {
- oplock_item = list_entry(GlobalOplock_Q.next,
- struct oplock_q_entry, qhead);
- cFYI(1, ("found oplock item to write out"));
- pTcon = oplock_item->tcon;
- inode = oplock_item->pinode;
- netfid = oplock_item->netfid;
- spin_unlock(&GlobalMid_Lock);
- DeleteOplockQEntry(oplock_item);
- /* can not grab inode sem here since it would
- deadlock when oplock received on delete
- since vfs_unlink holds the i_mutex across
- the call */
- /* mutex_lock(&inode->i_mutex);*/
- if (S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- if (CIFS_I(inode)->clientCanCacheAll == 0)
- break_lease(inode, FMODE_READ);
- else if (CIFS_I(inode)->clientCanCacheRead == 0)
- break_lease(inode, FMODE_WRITE);
-#endif
- rc = filemap_fdatawrite(inode->i_mapping);
- if (CIFS_I(inode)->clientCanCacheRead == 0) {
- waitrc = filemap_fdatawait(
- inode->i_mapping);
- invalidate_remote_inode(inode);
- }
- if (rc == 0)
- rc = waitrc;
- } else
- rc = 0;
- /* mutex_unlock(&inode->i_mutex);*/
- if (rc)
- CIFS_I(inode)->write_behind_rc = rc;
- cFYI(1, ("Oplock flush inode %p rc %d",
- inode, rc));
-
- /* releasing stale oplock after recent reconnect
- of smb session using a now incorrect file
- handle is not a data integrity issue but do
- not bother sending an oplock release if session
- to server still is disconnected since oplock
- already released by the server in that case */
- if (!pTcon->need_reconnect) {
- rc = CIFSSMBLock(0, pTcon, netfid,
- 0 /* len */ , 0 /* offset */, 0,
- 0, LOCKING_ANDX_OPLOCK_RELEASE,
- false /* wait flag */);
- cFYI(1, ("Oplock release rc = %d", rc));
- }
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1); /* yield in case q were corrupt */
- }
- } while (!kthread_should_stop());
-
- return 0;
-}
-
static int __init
init_cifs(void)
{
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
- INIT_LIST_HEAD(&GlobalOplock_Q);
#ifdef CONFIG_CIFS_EXPERIMENTAL
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1121,16 +1037,13 @@ init_cifs(void)
if (rc)
goto out_unregister_key_type;
#endif
- oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
- if (IS_ERR(oplockThread)) {
- rc = PTR_ERR(oplockThread);
- cERROR(1, ("error %d create oplock thread", rc));
- goto out_unregister_dfs_key_type;
- }
+ rc = slow_work_register_user();
+ if (rc)
+ goto out_unregister_resolver_key;
return 0;
- out_unregister_dfs_key_type:
+ out_unregister_resolver_key:
#ifdef CONFIG_CIFS_DFS_UPCALL
unregister_key_type(&key_type_dns_resolver);
out_unregister_key_type:
@@ -1167,7 +1080,6 @@ exit_cifs(void)
cifs_destroy_inodecache();
cifs_destroy_mids();
cifs_destroy_request_bufs();
- kthread_stop(oplockThread);
}
MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
extern const struct inode_operations cifs_file_inode_ops;
extern const struct inode_operations cifs_symlink_inode_ops;
-extern struct inode_operations cifs_dfs_referral_inode_operations;
+extern const struct inode_operations cifs_dfs_referral_inode_operations;
/* Functions related to files and directories */
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.60"
+#define CIFS_VERSION "1.61"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..5d0fde18039c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
*/
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/slow-work.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
/*
@@ -346,16 +347,33 @@ struct cifsFileInfo {
/* lock scope id (0 if none) */
struct file *pfile; /* needed for writepage */
struct inode *pInode; /* needed for oplock break */
+ struct vfsmount *mnt;
struct mutex lock_mutex;
struct list_head llist; /* list of byte range locks we have. */
bool closePend:1; /* file is marked to close */
bool invalidHandle:1; /* file closed via session abend */
- bool messageMode:1; /* for pipes: message vs byte mode */
- atomic_t wrtPending; /* handle in use - defer close */
+ bool oplock_break_cancelled:1;
+ atomic_t count; /* reference count */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
+ struct slow_work oplock_break; /* slow_work job for oplock breaks */
};
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+ atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+ if (atomic_dec_and_test(&cifs_file->count)) {
+ iput(cifs_file->pInode);
+ kfree(cifs_file);
+ }
+}
+
/*
* One of these for each file inode
*/
@@ -369,7 +387,6 @@ struct cifsInodeInfo {
unsigned long time; /* jiffies of last update/check of inode */
bool clientCanCacheRead:1; /* read oplock */
bool clientCanCacheAll:1; /* read and writebehind oplock */
- bool oplockPending:1;
bool delete_pending:1; /* DELETE_ON_CLOSE is set */
u64 server_eof; /* current file size on server */
u64 uniqueid; /* server inode number */
@@ -572,9 +589,9 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_LANMAN 0x10010
#define CIFSSEC_MUST_PLNTXT 0x20020
#ifdef CONFIG_CIFS_UPCALL
-#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */
+#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
#else
-#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */
+#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
#ifdef CONFIG_CIFS_UPCALL
@@ -656,8 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
-
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
/* DirNotify response queue */
@@ -708,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern const struct slow_work_ops cifs_oplock_break_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index da8fbf565991..5646727e33f5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
const int stage,
const struct nls_table *nls_cp);
extern __u16 GetNextMid(struct TCP_Server_Info *server);
-extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
- struct cifsTconInfo *);
-extern void DeleteOplockQEntry(struct oplock_q_entry *);
-extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
+extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
+ __u16 fileHandle, struct file *file,
+ struct vfsmount *mnt, unsigned int oflags);
extern int cifs_posix_open(char *full_path, struct inode **pinode,
- struct super_block *sb, int mode, int oflags,
- int *poplock, __u16 *pnetfid, int xid);
+ struct vfsmount *mnt, int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid);
extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
FILE_UNIX_BASIC_INFO *info,
struct cifs_sb_info *cifs_sb);
@@ -389,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
const struct nls_table *nls_codepage, int remap_special_chars);
extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..941441d3e386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -94,116 +94,145 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
open_file = list_entry(tmp, struct cifsFileInfo, tlist);
open_file->invalidHandle = true;
+ open_file->oplock_break_cancelled = true;
}
write_unlock(&GlobalSMBSeslock);
/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
to this tcon */
}
-/* Allocate and return pointer to an SMB request buffer, and set basic
- SMB information in the SMB header. If the return code is zero, this
- function must have filled in request_buf pointer */
+/* reconnect the socket, tcon, and smb session if needed */
static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
- void **request_buf)
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
{
int rc = 0;
+ struct cifsSesInfo *ses;
+ struct TCP_Server_Info *server;
+ struct nls_table *nls_codepage;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
+ /*
+ * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+ * tcp and smb session status done differently for those three - in the
+ * calling routine
+ */
+ if (!tcon)
+ return 0;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+ /*
+ * only tree disconnect, open, and write, (and ulogoff which does not
+ * have tcon) are allowed as we start force umount
+ */
+ if (tcon->tidStatus == CifsExiting) {
+ if (smb_command != SMB_COM_WRITE_ANDX &&
+ smb_command != SMB_COM_OPEN_ANDX &&
+ smb_command != SMB_COM_TREE_DISCONNECT) {
+ cFYI(1, ("can not send cmd %d while umounting",
+ smb_command));
+ return -ENODEV;
}
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes back on-line */
- } else /* TCP session is reestablished now */
- break;
- }
+ }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* we do not know sb */,
- NULL /* no vol info */);
- }
+ if (ses->status == CifsExiting)
+ return -EIO;
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
+ /*
+ * Give demultiplex thread up to 10 seconds to reconnect, should be
+ * greater than cifs socket timeout which is 7 seconds
+ */
+ while (server->tcpStatus == CifsNeedReconnect) {
+ wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus == CifsGood), 10 * HZ);
- } else {
- return -EIO;
+ /* is TCP session is reestablished now ?*/
+ if (server->tcpStatus != CifsNeedReconnect)
+ break;
+
+ /*
+ * on "soft" mounts we wait once. Hard mounts keep
+ * retrying until process is killed or server comes
+ * back on-line
+ */
+ if (!tcon->retry || ses->status == CifsExiting) {
+ cFYI(1, ("gave up waiting on reconnect in smb_init"));
+ return -EHOSTDOWN;
}
}
+
+ if (!ses->need_reconnect && !tcon->need_reconnect)
+ return 0;
+
+ nls_codepage = load_nls_default();
+
+ /*
+ * need to prevent multiple threads trying to simultaneously
+ * reconnect the same SMB session
+ */
+ down(&ses->sesSem);
+ if (ses->need_reconnect)
+ rc = cifs_setup_session(0, ses, nls_codepage);
+
+ /* do we need to reconnect tcon? */
+ if (rc || !tcon->need_reconnect) {
+ up(&ses->sesSem);
+ goto out;
+ }
+
+ mark_open_files_invalid(tcon);
+ rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+ up(&ses->sesSem);
+ cFYI(1, ("reconnect tcon rc = %d", rc));
+
+ if (rc)
+ goto out;
+
+ /*
+ * FIXME: check if wsize needs updated due to negotiated smb buffer
+ * size shrinking
+ */
+ atomic_inc(&tconInfoReconnectCount);
+
+ /* tell server Unix caps we support */
+ if (ses->capabilities & CAP_UNIX)
+ reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+ /*
+ * Removed call to reopen open files here. It is safer (and faster) to
+ * reopen files one at a time as needed in read and write.
+ *
+ * FIXME: what about file locks? don't we need to reclaim them ASAP?
+ */
+
+out:
+ /*
+ * Check if handle based operation so we know whether we can continue
+ * or not without returning to caller to reset file handle
+ */
+ switch (smb_command) {
+ case SMB_COM_READ_ANDX:
+ case SMB_COM_WRITE_ANDX:
+ case SMB_COM_CLOSE:
+ case SMB_COM_FIND_CLOSE2:
+ case SMB_COM_LOCKING_ANDX:
+ rc = -EAGAIN;
+ }
+
+ unload_nls(nls_codepage);
+ return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+ SMB information in the SMB header. If the return code is zero, this
+ function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf)
+{
+ int rc = 0;
+
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -256,101 +285,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
{
int rc = 0;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
- }
-
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes on-line */
- } else /* TCP session is reestablished now */
- break;
- }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* do not know sb */,
- NULL /* no vol info */);
- }
-
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
-
- } else {
- return -EIO;
- }
- }
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -3961,6 +3896,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
if (is_unicode) {
__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
GFP_KERNEL);
+ if (tmp == NULL) {
+ rc = -ENOMEM;
+ goto parse_DFS_referrals_exit;
+ }
cifsConvertToUCS((__le16 *) tmp, searchName,
PATH_MAX, nls_codepage, remap);
node->path_consumed = cifs_ucs2_bytes(tmp,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..63ea83ff687f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
}
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
{
struct list_head *tmp;
struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
if (server->tcpStatus == CifsNew)
continue;
- if (addr->ss_family == AF_INET &&
- (addr4->sin_addr.s_addr !=
- server->addr.sockAddr.sin_addr.s_addr))
- continue;
- else if (addr->ss_family == AF_INET6 &&
- (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
- &addr6->sin6_addr) ||
- server->addr.sockAddr6.sin6_scope_id !=
- addr6->sin6_scope_id))
- continue;
+ switch (addr->ss_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr ==
+ server->addr.sockAddr.sin_addr.s_addr) {
+ addr4->sin_port = htons(port);
+ /* user overrode default port? */
+ if (addr4->sin_port) {
+ if (addr4->sin_port !=
+ server->addr.sockAddr.sin_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+
+ case AF_INET6:
+ if (ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr) &&
+ (addr6->sin6_scope_id ==
+ server->addr.sockAddr6.sin6_scope_id)) {
+ addr6->sin6_port = htons(port);
+ /* user overrode default port? */
+ if (addr6->sin6_port) {
+ if (addr6->sin6_port !=
+ server->addr.sockAddr6.sin6_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+ }
++server->srv_count;
write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
}
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr);
+ tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
if (tcp_ses)
return tcp_ses;
@@ -1556,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
out_err:
if (tcp_ses) {
- kfree(tcp_ses->hostname);
+ if (!IS_ERR(tcp_ses->hostname))
+ kfree(tcp_ses->hostname);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -1649,7 +1671,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
CIFSSMBTDis(xid, tcon);
_FreeXid(xid);
- DeleteTconOplockQEntries(tcon);
tconInfoFree(tcon);
cifs_put_smb_ses(ses);
}
@@ -2199,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
{
int rc;
- __u64 inode_num;
FILE_ALL_INFO *pfile_info;
- rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != -EOPNOTSUPP)
- return rc;
-
pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (pfile_info == NULL)
return -ENOMEM;
@@ -2636,9 +2649,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
return -EIO;
smb_buffer = cifs_buf_get();
- if (smb_buffer == NULL) {
+ if (smb_buffer == NULL)
return -ENOMEM;
- }
+
smb_buffer_response = smb_buffer;
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..6ccf7262d1b7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -24,6 +24,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -129,44 +130,45 @@ cifs_bp_rename_retry:
return full_path;
}
-static void
-cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
- struct cifsTconInfo *tcon, bool write_only)
+struct cifsFileInfo *
+cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
+ struct file *file, struct vfsmount *mnt, unsigned int oflags)
{
int oplock = 0;
struct cifsFileInfo *pCifsFile;
struct cifsInodeInfo *pCifsInode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
if (pCifsFile == NULL)
- return;
+ return pCifsFile;
if (oplockEnabled)
oplock = REQ_OPLOCK;
pCifsFile->netfid = fileHandle;
pCifsFile->pid = current->tgid;
- pCifsFile->pInode = newinode;
+ pCifsFile->pInode = igrab(newinode);
+ pCifsFile->mnt = mnt;
+ pCifsFile->pfile = file;
pCifsFile->invalidHandle = false;
pCifsFile->closePend = false;
mutex_init(&pCifsFile->fh_mutex);
mutex_init(&pCifsFile->lock_mutex);
INIT_LIST_HEAD(&pCifsFile->llist);
- atomic_set(&pCifsFile->wrtPending, 0);
+ atomic_set(&pCifsFile->count, 1);
+ slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
- /* set the following in open now
- pCifsFile->pfile = file; */
write_lock(&GlobalSMBSeslock);
- list_add(&pCifsFile->tlist, &tcon->openFileList);
+ list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
pCifsInode = CIFS_I(newinode);
if (pCifsInode) {
/* if readable file instance put first in list*/
- if (write_only)
+ if (oflags & FMODE_READ)
+ list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+ else
list_add_tail(&pCifsFile->flist,
&pCifsInode->openFileList);
- else
- list_add(&pCifsFile->flist, &pCifsInode->openFileList);
if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
pCifsInode->clientCanCacheAll = true;
@@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
pCifsInode->clientCanCacheRead = true;
}
write_unlock(&GlobalSMBSeslock);
+
+ return pCifsFile;
}
int cifs_posix_open(char *full_path, struct inode **pinode,
- struct super_block *sb, int mode, int oflags,
- int *poplock, __u16 *pnetfid, int xid)
+ struct vfsmount *mnt, int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid)
{
int rc;
- __u32 oplock;
- bool write_only = false;
FILE_UNIX_BASIC_INFO *presp_data;
__u32 posix_flags = 0;
- struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
struct cifs_fattr fattr;
cFYI(1, ("posix open %s", full_path));
@@ -212,9 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
posix_flags |= SMB_O_EXCL;
if (oflags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
- if (oflags & O_APPEND)
- posix_flags |= SMB_O_APPEND;
- if (oflags & O_SYNC)
+ /* be safe and imply O_SYNC for O_DSYNC */
+ if (oflags & O_DSYNC)
posix_flags |= SMB_O_SYNC;
if (oflags & O_DIRECTORY)
posix_flags |= SMB_O_DIRECTORY;
@@ -223,12 +224,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
if (oflags & O_DIRECT)
posix_flags |= SMB_O_DIRECT;
- if (!(oflags & FMODE_READ))
- write_only = true;
-
mode &= ~current_umask();
rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
- pnetfid, presp_data, &oplock, full_path,
+ pnetfid, presp_data, poplock, full_path,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc)
@@ -244,7 +242,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
/* get new inode and set it up */
if (*pinode == NULL) {
- *pinode = cifs_iget(sb, &fattr);
+ *pinode = cifs_iget(mnt->mnt_sb, &fattr);
if (!*pinode) {
rc = -ENOMEM;
goto posix_open_ret;
@@ -253,7 +251,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
cifs_fattr_to_inode(*pinode, &fattr);
}
- cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+ cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
posix_open_ret:
kfree(presp_data);
@@ -280,7 +278,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
int rc = -ENOENT;
int xid;
int create_options = CREATE_NOT_DIR;
- int oplock = 0;
+ __u32 oplock = 0;
int oflags;
bool posix_create = false;
/*
@@ -298,7 +296,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
int disposition = FILE_OVERWRITE_IF;
- bool write_only = false;
xid = GetXid();
@@ -323,7 +320,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
- rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+ rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
mode, oflags, &oplock, &fileHandle, xid);
/* EIO could indicate that (posix open) operation is not
supported, despite what server claimed in capability
@@ -351,11 +348,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
desiredAccess = 0;
if (oflags & FMODE_READ)
desiredAccess |= GENERIC_READ; /* is this too little? */
- if (oflags & FMODE_WRITE) {
+ if (oflags & FMODE_WRITE)
desiredAccess |= GENERIC_WRITE;
- if (!(oflags & FMODE_READ))
- write_only = true;
- }
if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
disposition = FILE_CREATE;
@@ -470,8 +464,8 @@ cifs_create_set_dentry:
/* mknod case - do not leave file open */
CIFSSMBClose(xid, tcon, fileHandle);
} else if (!(posix_create) && (newinode)) {
- cifs_fill_fileinfo(newinode, fileHandle,
- cifs_sb->tcon, write_only);
+ cifs_new_fileinfo(newinode, fileHandle, NULL,
+ nd->path.mnt, oflags);
}
cifs_create_out:
kfree(buf);
@@ -611,7 +605,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
{
int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- int oplock = 0;
+ __u32 oplock = 0;
__u16 fileHandle = 0;
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
@@ -648,9 +642,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* O_EXCL: optimize away the lookup, but don't hash the dentry. Let
* the VFS handle the create.
*/
- if (nd->flags & LOOKUP_EXCL) {
+ if (nd && (nd->flags & LOOKUP_EXCL)) {
d_instantiate(direntry, NULL);
- return 0;
+ return NULL;
}
/* can not grab the rename sem here since it would
@@ -680,11 +674,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* reduction in network traffic in the other paths.
*/
if (pTcon->unix_ext) {
- if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+ if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
(nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
(nd->intent.open.flags & O_CREAT)) {
- rc = cifs_posix_open(full_path, &newInode,
- parent_dir_inode->i_sb,
+ rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
nd->intent.open.create_mode,
nd->intent.open.flags, &oplock,
&fileHandle, xid);
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 75949d6a5f1b..6177f7cca16a 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
*/
/*
- * See Documentation/filesystems/Exporting
+ * See Documentation/filesystems/nfs/Exporting
* and examples in fs/exportfs
*
* Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..057e1dae12ab 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -30,6 +30,7 @@
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/delay.h>
+#include <linux/mount.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -39,29 +40,6 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
-static inline struct cifsFileInfo *cifs_init_private(
- struct cifsFileInfo *private_data, struct inode *inode,
- struct file *file, __u16 netfid)
-{
- memset(private_data, 0, sizeof(struct cifsFileInfo));
- private_data->netfid = netfid;
- private_data->pid = current->tgid;
- mutex_init(&private_data->fh_mutex);
- mutex_init(&private_data->lock_mutex);
- INIT_LIST_HEAD(&private_data->llist);
- private_data->pfile = file; /* needed for writepage */
- private_data->pInode = inode;
- private_data->invalidHandle = false;
- private_data->closePend = false;
- /* we have to track num writers to the inode, since writepages
- does not tell us which handle the write is for so there can
- be a close (overlapping with write) of the filehandle that
- cifs_writepages chose to use */
- atomic_set(&private_data->wrtPending, 0);
-
- return private_data;
-}
-
static inline int cifs_convert_flags(unsigned int flags)
{
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -98,8 +76,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
reopening a file. They had their effect on the original open */
if (flags & O_APPEND)
posix_flags |= (fmode_t)O_APPEND;
- if (flags & O_SYNC)
- posix_flags |= (fmode_t)O_SYNC;
+ if (flags & O_DSYNC)
+ posix_flags |= (fmode_t)O_DSYNC;
+ if (flags & __O_SYNC)
+ posix_flags |= (fmode_t)__O_SYNC;
if (flags & O_DIRECTORY)
posix_flags |= (fmode_t)O_DIRECTORY;
if (flags & O_NOFOLLOW)
@@ -125,9 +105,11 @@ static inline int cifs_get_disposition(unsigned int flags)
}
/* all arguments to this function must be checked for validity in caller */
-static inline int cifs_posix_open_inode_helper(struct inode *inode,
- struct file *file, struct cifsInodeInfo *pCifsInode,
- struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+static inline int
+cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
+ struct cifsInodeInfo *pCifsInode,
+ struct cifsFileInfo *pCifsFile, __u32 oplock,
+ u16 netfid)
{
write_lock(&GlobalSMBSeslock);
@@ -221,17 +203,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
struct timespec temp;
int rc;
- /* want handles we can use to read with first
- in the list so we do not have to walk the
- list to search for one in write_begin */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
- list_add_tail(&pCifsFile->flist,
- &pCifsInode->openFileList);
- } else {
- list_add(&pCifsFile->flist,
- &pCifsInode->openFileList);
- }
- write_unlock(&GlobalSMBSeslock);
if (pCifsInode->clientCanCacheRead) {
/* we have the inode open somewhere else
no need to discard cache data */
@@ -281,7 +252,8 @@ client_can_cache:
int cifs_open(struct inode *inode, struct file *file)
{
int rc = -EACCES;
- int xid, oplock;
+ int xid;
+ __u32 oplock;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
@@ -326,7 +298,7 @@ int cifs_open(struct inode *inode, struct file *file)
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+ rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -416,24 +388,17 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, ("cifs_open returned 0x%x", rc));
goto out;
}
- file->private_data =
- kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+ pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
+ file->f_flags);
+ file->private_data = pCifsFile;
if (file->private_data == NULL) {
rc = -ENOMEM;
goto out;
}
- pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
- write_lock(&GlobalSMBSeslock);
- list_add(&pCifsFile->tlist, &tcon->openFileList);
- pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
- if (pCifsInode) {
- rc = cifs_open_inode_helper(inode, file, pCifsInode,
- pCifsFile, tcon,
- &oplock, buf, full_path, xid);
- } else {
- write_unlock(&GlobalSMBSeslock);
- }
+ rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
+ &oplock, buf, full_path, xid);
if (oplock & CIFS_CREATE_ACTION) {
/* time to set mode which we can not set earlier due to
@@ -476,7 +441,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
static int cifs_reopen_file(struct file *file, bool can_flush)
{
int rc = -EACCES;
- int xid, oplock;
+ int xid;
+ __u32 oplock;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
@@ -545,7 +511,7 @@ reopen_error_exit:
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+ rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -643,7 +609,7 @@ int cifs_close(struct inode *inode, struct file *file)
if (!pTcon->need_reconnect) {
write_unlock(&GlobalSMBSeslock);
timeout = 2;
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
+ while ((atomic_read(&pSMBFile->count) != 1)
&& (timeout <= 2048)) {
/* Give write a better chance to get to
server ahead of the close. We do not
@@ -657,8 +623,6 @@ int cifs_close(struct inode *inode, struct file *file)
msleep(timeout);
timeout *= 4;
}
- if (atomic_read(&pSMBFile->wrtPending))
- cERROR(1, ("close with pending write"));
if (!pTcon->need_reconnect &&
!pSMBFile->invalidHandle)
rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +645,7 @@ int cifs_close(struct inode *inode, struct file *file)
list_del(&pSMBFile->flist);
list_del(&pSMBFile->tlist);
write_unlock(&GlobalSMBSeslock);
- timeout = 10;
- /* We waited above to give the SMBWrite a chance to issue
- on the wire (so we do not get SMBWrite returning EBADF
- if writepages is racing with close. Note that writepages
- does not specify a file handle, so it is possible for a file
- to be opened twice, and the application close the "wrong"
- file handle - in these cases we delay long enough to allow
- the SMBWrite to get on the wire before the SMB Close.
- We allow total wait here over 45 seconds, more than
- oplock break time, and more than enough to allow any write
- to complete on the server, or to time out on the client */
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
- && (timeout <= 50000)) {
- cERROR(1, ("writes pending, delay free of handle"));
- msleep(timeout);
- timeout *= 8;
- }
- kfree(file->private_data);
+ cifsFileInfo_put(file->private_data);
file->private_data = NULL;
} else
rc = -EBADF;
@@ -1236,7 +1183,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
if (!open_file->invalidHandle) {
/* found a good file */
/* lock it so it will not be closed on us */
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
read_unlock(&GlobalSMBSeslock);
return open_file;
} /* else might as well continue, and look for
@@ -1276,7 +1223,7 @@ refind_writable:
if (open_file->pfile &&
((open_file->pfile->f_flags & O_RDWR) ||
(open_file->pfile->f_flags & O_WRONLY))) {
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -1293,7 +1240,7 @@ refind_writable:
else { /* start over in case this was deleted */
/* since the list could be modified */
read_lock(&GlobalSMBSeslock);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
goto refind_writable;
}
}
@@ -1309,7 +1256,7 @@ refind_writable:
read_lock(&GlobalSMBSeslock);
/* can not use this handle, no write
pending on this one after all */
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
if (open_file->closePend) /* list could have changed */
goto refind_writable;
@@ -1373,7 +1320,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (open_file) {
bytes_written = cifs_write(open_file->pfile, write_data,
to-from, &offset);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
if ((bytes_written > 0) && (offset))
@@ -1562,7 +1509,7 @@ retry:
bytes_to_write, offset,
&bytes_written, iov, n_iov,
long_op);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cifs_update_eof(cifsi, offset, bytes_written);
if (rc || bytes_written < bytes_to_write) {
@@ -2329,6 +2276,73 @@ out:
return rc;
}
+static void
+cifs_oplock_break(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ struct inode *inode = cfile->pInode;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
+ int rc, waitrc = 0;
+
+ if (inode && S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+ if (cinode->clientCanCacheAll == 0)
+ break_lease(inode, FMODE_READ);
+ else if (cinode->clientCanCacheRead == 0)
+ break_lease(inode, FMODE_WRITE);
+#endif
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (cinode->clientCanCacheRead == 0) {
+ waitrc = filemap_fdatawait(inode->i_mapping);
+ invalidate_remote_inode(inode);
+ }
+ if (!rc)
+ rc = waitrc;
+ if (rc)
+ cinode->write_behind_rc = rc;
+ cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+ }
+
+ /*
+ * releasing stale oplock after recent reconnect of smb session using
+ * a now incorrect file handle is not a data integrity issue but do
+ * not bother sending an oplock release if session to server still is
+ * disconnected since oplock already released by the server
+ */
+ if (!cfile->closePend && !cfile->oplock_break_cancelled) {
+ rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
+ LOCKING_ANDX_OPLOCK_RELEASE, false);
+ cFYI(1, ("Oplock release rc = %d", rc));
+ }
+}
+
+static int
+cifs_oplock_break_get(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ mntget(cfile->mnt);
+ cifsFileInfo_get(cfile);
+ return 0;
+}
+
+static void
+cifs_oplock_break_put(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ mntput(cfile->mnt);
+ cifsFileInfo_put(cfile);
+}
+
+const struct slow_work_ops cifs_oplock_break_ops = {
+ .get_ref = cifs_oplock_break_get,
+ .put_ref = cifs_oplock_break_put,
+ .execute = cifs_oplock_break,
+};
+
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
.readpages = cifs_readpages,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..cababd8a52df 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc1) {
+ if (rc1 || !fattr.cf_uniqueid) {
cFYI(1, ("GetSrvInodeNum rc %d", rc1));
fattr.cf_uniqueid = iunique(sb, ROOT_I);
- /* disable serverino if call not supported */
- if (rc1 == -EINVAL)
- cifs_sb->mnt_cifs_flags &=
- ~CIFS_MOUNT_SERVER_INUM;
+ cifs_autodisable_serverino(cifs_sb);
}
} else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
@@ -800,7 +797,7 @@ set_via_filehandle:
if (open_file == NULL)
CIFSSMBClose(xid, pTcon, netfid);
else
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
out:
return rc;
}
@@ -1557,57 +1554,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static int cifs_vmtruncate(struct inode *inode, loff_t offset)
{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
+ loff_t oldsize;
+ int err;
spin_lock(&inode->i_lock);
- if (inode->i_size < offset)
- goto do_expand;
- /*
- * truncation of in-use swapfiles is disallowed - it would cause
- * subsequent swapout to scribble on the now-freed blocks.
- */
- if (IS_SWAPFILE(inode)) {
- spin_unlock(&inode->i_lock);
- goto out_busy;
- }
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
- /*
- * unmap_mapping_range is called twice, first simply for efficiency
- * so that truncate_inode_pages does fewer single-page unmaps. However
- * after this first call, and before truncate_inode_pages finishes,
- * it is possible for private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second unmap_mapping_range
- * call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- goto out_truncate;
-
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit) {
+ err = inode_newsize_ok(inode, offset);
+ if (err) {
spin_unlock(&inode->i_lock);
- goto out_sig;
- }
- if (offset > inode->i_sb->s_maxbytes) {
- spin_unlock(&inode->i_lock);
- goto out_big;
+ goto out;
}
+
+ oldsize = inode->i_size;
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
-out_truncate:
+ truncate_pagecache(inode, oldsize, offset);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
-out_busy:
- return -ETXTBSY;
+out:
+ return err;
}
static int
@@ -1635,7 +1599,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
__u32 npid = open_file->pid;
rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
npid, false);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cFYI(1, ("SetFSize for attrs rc = %d", rc));
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
unsigned int bytes_written;
@@ -1790,7 +1754,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
u16 nfid = open_file->netfid;
u32 npid = open_file->pid;
rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
} else {
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
cifs_sb->local_nls,
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e079a9190ec4..d27d4ec6579b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -32,7 +32,6 @@
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
-extern struct task_struct *oplockThread;
/* The xid serves as a useful identifier for each incoming vfs request,
in a similar way to the mid which is useful to track each sent smb,
@@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
struct cifsTconInfo *tcon;
struct cifsInodeInfo *pCifsInode;
struct cifsFileInfo *netfile;
+ int rc;
cFYI(1, ("Checking for oplock break or dnotify response"));
if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
continue;
cifs_stats_inc(&tcon->num_oplock_brks);
- write_lock(&GlobalSMBSeslock);
+ read_lock(&GlobalSMBSeslock);
list_for_each(tmp2, &tcon->openFileList) {
netfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
if (pSMB->Fid != netfile->netfid)
continue;
- write_unlock(&GlobalSMBSeslock);
- read_unlock(&cifs_tcp_ses_lock);
+ /*
+ * don't do anything if file is about to be
+ * closed anyway.
+ */
+ if (netfile->closePend) {
+ read_unlock(&GlobalSMBSeslock);
+ read_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+
cFYI(1, ("file id match, oplock break"));
pCifsInode = CIFS_I(netfile->pInode);
pCifsInode->clientCanCacheAll = false;
if (pSMB->OplockLevel == 0)
pCifsInode->clientCanCacheRead = false;
- pCifsInode->oplockPending = true;
- AllocOplockQEntry(netfile->pInode,
- netfile->netfid, tcon);
- cFYI(1, ("about to wake up oplock thread"));
- if (oplockThread)
- wake_up_process(oplockThread);
-
+ rc = slow_work_enqueue(&netfile->oplock_break);
+ if (rc) {
+ cERROR(1, ("failed to enqueue oplock "
+ "break: %d\n", rc));
+ } else {
+ netfile->oplock_break_cancelled = false;
+ }
+ read_unlock(&GlobalSMBSeslock);
+ read_unlock(&cifs_tcp_ses_lock);
return true;
}
- write_unlock(&GlobalSMBSeslock);
+ read_unlock(&GlobalSMBSeslock);
read_unlock(&cifs_tcp_ses_lock);
cFYI(1, ("No matching file for oplock break"));
return true;
@@ -705,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
ctoUCS_out:
return i;
}
+
+void
+cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+ cERROR(1, ("Autodisabling the use of server inode numbers on "
+ "%s. This server doesn't seem to support them "
+ "properly. Hardlinks will not be recognized on this "
+ "mount. Consider mounting with the \"noserverino\" "
+ "option to silence this message.",
+ cifs_sb->tcon->treeName));
+ }
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f823a4a208a7..f84062f9a985 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
-void
+static void
cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
struct cifs_sb_info *cifs_sb)
{
@@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
cifs_fill_common_info(fattr, cifs_sb);
}
-void
+static void
cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
struct cifs_sb_info *cifs_sb)
{
@@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
pfindEntry, cifs_sb);
- /* FIXME: make _to_fattr functions fill this out */
- if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+ if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
fattr.cf_uniqueid = inum;
- else
+ } else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ }
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..07b8e71544ee 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -103,57 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
mempool_free(midEntry, cifs_mid_poolp);
}
-struct oplock_q_entry *
-AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
-{
- struct oplock_q_entry *temp;
- if ((pinode == NULL) || (tcon == NULL)) {
- cERROR(1, ("Null parms passed to AllocOplockQEntry"));
- return NULL;
- }
- temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
- GFP_KERNEL);
- if (temp == NULL)
- return temp;
- else {
- temp->pinode = pinode;
- temp->tcon = tcon;
- temp->netfid = fid;
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&temp->qhead, &GlobalOplock_Q);
- spin_unlock(&GlobalMid_Lock);
- }
- return temp;
-
-}
-
-void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
-{
- spin_lock(&GlobalMid_Lock);
- /* should we check if list empty first? */
- list_del(&oplockEntry->qhead);
- spin_unlock(&GlobalMid_Lock);
- kmem_cache_free(cifs_oplock_cachep, oplockEntry);
-}
-
-
-void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
-{
- struct oplock_q_entry *temp;
-
- if (tcon == NULL)
- return;
-
- spin_lock(&GlobalMid_Lock);
- list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
- if ((temp->tcon) && (temp->tcon == tcon)) {
- list_del(&temp->qhead);
- kmem_cache_free(cifs_oplock_cachep, temp);
- }
- }
- spin_unlock(&GlobalMid_Lock);
-}
-
static int
smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
{
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
#define _CODA_INT_
struct dentry;
+struct file;
extern struct file_system_type coda_fs_type;
extern unsigned long coda_timeout;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/time.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/ioport.h>
#include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index 6d6f98fe64a0..6c19040ffeef 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -100,13 +100,6 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
get_compat_timespec(&tv[1], &t[1]))
return -EFAULT;
- if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW)
- && tv[0].tv_sec != 0)
- return -EINVAL;
- if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW)
- && tv[1].tv_sec != 0)
- return -EINVAL;
-
if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
return 0;
}
@@ -775,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
{
- unsigned long type_page;
+ char *kernel_type;
unsigned long data_page;
- unsigned long dev_page;
+ char *kernel_dev;
char *dir_page;
int retval;
- retval = copy_mount_options (type, &type_page);
+ retval = copy_mount_string(type, &kernel_type);
if (retval < 0)
goto out;
@@ -790,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
if (IS_ERR(dir_page))
goto out1;
- retval = copy_mount_options (dev_name, &dev_page);
+ retval = copy_mount_string(dev_name, &kernel_dev);
if (retval < 0)
goto out2;
- retval = copy_mount_options (data, &data_page);
+ retval = copy_mount_options(data, &data_page);
if (retval < 0)
goto out3;
retval = -EINVAL;
- if (type_page && data_page) {
- if (!strcmp((char *)type_page, SMBFS_NAME)) {
+ if (kernel_type && data_page) {
+ if (!strcmp(kernel_type, SMBFS_NAME)) {
do_smb_super_data_conv((void *)data_page);
- } else if (!strcmp((char *)type_page, NCPFS_NAME)) {
+ } else if (!strcmp(kernel_type, NCPFS_NAME)) {
do_ncp_super_data_conv((void *)data_page);
- } else if (!strcmp((char *)type_page, NFS4_NAME)) {
+ } else if (!strcmp(kernel_type, NFS4_NAME)) {
if (do_nfs4_super_data_conv((void *) data_page))
goto out4;
}
}
- retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
+ retval = do_mount(kernel_dev, dir_page, kernel_type,
flags, (void*)data_page);
out4:
free_page(data_page);
out3:
- free_page(dev_page);
+ kfree(kernel_dev);
out2:
putname(dir_page);
out1:
- free_page(type_page);
+ kfree(kernel_type);
out:
return retval;
}
@@ -1539,6 +1532,8 @@ int compat_do_execve(char * filename,
if (retval < 0)
goto out;
+ current->stack_start = current->mm->start_stack;
+
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f91fd51b32e3..d84e7058c298 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1800,7 +1800,7 @@ struct space_resv_32 {
/* just account for different alignment */
static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
{
- struct space_resv_32 __user *p32 = (void __user *)arg;
+ struct space_resv_32 __user *p32 = compat_ptr(arg);
struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
#else
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- error = ioctl_preallocate(filp, (void __user *)arg);
+ error = ioctl_preallocate(filp, compat_ptr(arg));
goto out_fput;
#endif
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
};
static struct backing_dev_info configfs_backing_dev_info = {
+ .name = "configfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
+#include <linux/hardirq.h>
#include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 75efb028974b..d5f8c96964be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,14 +18,13 @@
#include <linux/mount.h>
#include <linux/tty.h>
#include <linux/mutex.h>
+#include <linux/magic.h>
#include <linux/idr.h>
#include <linux/devpts_fs.h>
#include <linux/parser.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
-#define DEVPTS_SUPER_MAGIC 0x1cd1
-
#define DEVPTS_DEFAULT_MODE 0600
/*
* ptmx is a new node in /dev/pts and will be unused in legacy (single-
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1d1d27442235..1c8bb8c3a82e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
return rv;
}
-static struct seq_operations format1_seq_ops;
-static struct seq_operations format2_seq_ops;
-static struct seq_operations format3_seq_ops;
+static const struct seq_operations format1_seq_ops;
+static const struct seq_operations format2_seq_ops;
+static const struct seq_operations format3_seq_ops;
static void *table_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
}
}
-static struct seq_operations format1_seq_ops = {
+static const struct seq_operations format1_seq_ops = {
.start = table_seq_start,
.next = table_seq_next,
.stop = table_seq_stop,
.show = table_seq_show,
};
-static struct seq_operations format2_seq_ops = {
+static const struct seq_operations format2_seq_ops = {
.start = table_seq_start,
.next = table_seq_next,
.stop = table_seq_stop,
.show = table_seq_show,
};
-static struct seq_operations format3_seq_ops = {
+static const struct seq_operations format3_seq_ops = {
.start = table_seq_start,
.next = table_seq_next,
.stop = table_seq_stop,
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..70736eb4b516 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
#define CF_CONNECT_PENDING 3
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
static inline void lowcomms_connect_sock(struct connection *con)
{
+ if (test_bit(CF_CLOSE, &con->flags))
+ return;
if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -313,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
+ /* with sctp there's no connecting without sending */
+ if (dlm_config.ci_protocol != 0)
+ return 0;
+
if (nodeid == dlm_our_nodeid())
return 0;
@@ -452,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
int prim_len, ret;
int addr_len;
struct connection *new_con;
- struct file *file;
sctp_peeloff_arg_t parg;
int parglen = sizeof(parg);
+ int err;
/*
* We get this before any data for an association.
@@ -509,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
SCTP_SOCKOPT_PEELOFF,
(void *)&parg, &parglen);
- if (ret) {
+ if (ret < 0) {
log_print("Can't peel off a socket for "
- "connection %d to node %d: err=%d\n",
+ "connection %d to node %d: err=%d",
parg.associd, nodeid, ret);
+ return;
+ }
+ new_con->sock = sockfd_lookup(parg.sd, &err);
+ if (!new_con->sock) {
+ log_print("sockfd_lookup error %d", err);
+ return;
}
- file = fget(parg.sd);
- new_con->sock = SOCKET_I(file->f_dentry->d_inode);
add_sock(new_con->sock, new_con);
- fput(file);
- put_unused_fd(parg.sd);
+ sockfd_put(new_con->sock);
- log_print("got new/restarted association %d nodeid %d",
- (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+ log_print("connecting to %d sctp association %d",
+ nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -834,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
if (con->retries++ > MAX_CONNECT_RETRIES)
return;
- log_print("Initiating association with node %d", con->nodeid);
-
if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
log_print("no address for nodeid %d", con->nodeid);
return;
@@ -852,11 +860,14 @@ static void sctp_init_assoc(struct connection *con)
outmessage.msg_flags = MSG_EOR;
spin_lock(&con->writequeue_lock);
- e = list_entry(con->writequeue.next, struct writequeue_entry,
- list);
- BUG_ON((struct list_head *) e == &con->writequeue);
+ if (list_empty(&con->writequeue)) {
+ spin_unlock(&con->writequeue_lock);
+ log_print("writequeue empty for nodeid %d", con->nodeid);
+ return;
+ }
+ e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
spin_unlock(&con->writequeue_lock);
@@ -926,10 +937,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
- sock_release(sock);
+ if (dlm_nodeid_to_addr(con->nodeid, &saddr))
goto out_err;
- }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
@@ -1284,7 +1293,6 @@ out:
static void send_to_sock(struct connection *con)
{
int ret = 0;
- ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset;
@@ -1293,8 +1301,6 @@ static void send_to_sock(struct connection *con)
if (con->sock == NULL)
goto out_connect;
- sendpage = con->sock->ops->sendpage;
-
spin_lock(&con->writequeue_lock);
for (;;) {
e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1315,8 @@ static void send_to_sock(struct connection *con)
ret = 0;
if (len) {
- ret = sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
if (ret == -EAGAIN || ret == 0) {
cond_resched();
goto out;
@@ -1370,6 +1376,13 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
+ clear_bit(CF_WRITE_PENDING, &con->flags);
+ set_bit(CF_CLOSE, &con->flags);
+ if (cancel_work_sync(&con->swork))
+ log_print("canceled swork for node %d", nodeid);
+ if (cancel_work_sync(&con->rwork))
+ log_print("canceled rwork for node %d", nodeid);
clean_one_writequeue(con);
close_connection(con, true);
}
@@ -1395,9 +1408,10 @@ static void process_send_sockets(struct work_struct *work)
if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
con->connect_action(con);
+ set_bit(CF_WRITE_PENDING, &con->flags);
}
- clear_bit(CF_WRITE_PENDING, &con->flags);
- send_to_sock(con);
+ if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+ send_to_sock(con);
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
return rv;
}
- return genlmsg_unicast(skb, listener_nlpid);
+ return genlmsg_unicast(&init_net, skb, listener_nlpid);
}
static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
}
int drop_caches_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
if (write) {
if (sysctl_drop_caches & 1)
drop_pagecache();
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 0c754e64232b..1cd6d9d3e29a 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,9 @@
config ECRYPT_FS
tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+ depends on EXPERIMENTAL && KEYS && CRYPTO
+ select CRYPTO_ECB
+ select CRYPTO_CBC
+ select CRYPTO_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b91851f1cda3..fbb6e5eed697 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
crypto_free_blkcipher(crypt_stat->tfm);
if (crypt_stat->hash_tfm)
crypto_free_hash(crypt_stat->hash_tfm);
- mutex_lock(&crypt_stat->keysig_list_mutex);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
}
- mutex_unlock(&crypt_stat->keysig_list_mutex);
memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
}
@@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page)
+ extent_offset), crypt_stat);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
offset, crypt_stat->extent_size);
- if (rc) {
+ if (rc < 0) {
ecryptfs_printk(KERN_ERR, "Error attempting "
"to write lower page; rc = [%d]"
"\n", rc);
goto out;
}
}
+ rc = 0;
out:
if (enc_extent_page) {
kunmap(enc_extent_page);
@@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page)
rc = ecryptfs_read_lower(enc_extent_virt, offset,
crypt_stat->extent_size,
ecryptfs_inode);
- if (rc) {
+ if (rc < 0) {
ecryptfs_printk(KERN_ERR, "Error attempting "
"to read lower page; rc = [%d]"
"\n", rc);
@@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
kfree(full_alg_name);
if (IS_ERR(crypt_stat->tfm)) {
rc = PTR_ERR(crypt_stat->tfm);
+ crypt_stat->tfm = NULL;
ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
"Error initializing cipher [%s]\n",
crypt_stat->cipher);
@@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
struct ecryptfs_global_auth_tok *global_auth_tok;
int rc = 0;
+ mutex_lock(&crypt_stat->keysig_list_mutex);
mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
list_for_each_entry(global_auth_tok,
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
@@ -934,13 +936,13 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
if (rc) {
printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
- mutex_unlock(
- &mount_crypt_stat->global_auth_tok_list_mutex);
goto out;
}
}
- mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
out:
+ mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+ mutex_unlock(&crypt_stat->keysig_list_mutex);
return rc;
}
@@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data,
crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
ecryptfs_inode);
- if (rc) {
+ if (rc < 0) {
printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
__func__, rc);
goto out;
}
if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
rc = -EINVAL;
- }
+ } else
+ rc = 0;
out:
return rc;
}
@@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
0, virt_len);
- if (rc)
+ if (rc < 0)
printk(KERN_ERR "%s: Error attempting to write header "
- "information to lower file; rc = [%d]\n", __func__,
- rc);
+ "information to lower file; rc = [%d]\n", __func__, rc);
+ else
+ rc = 0;
return rc;
}
@@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
}
rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
ecryptfs_inode);
- if (!rc)
+ if (rc >= 0)
rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
ecryptfs_dentry,
ECRYPTFS_VALIDATE_HEADER_SIZE);
@@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
} else {
printk(KERN_ERR "%s: No support for requested filename "
"encryption method in this release\n", __func__);
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
goto out;
}
out:
@@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
- "[%s]; rc = [%d]\n", cipher_name, rc);
+ "[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
@@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
printk(KERN_ERR "Error attempting to set key of size [%zd] for "
- "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
+ "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
+ rc);
rc = -EINVAL;
goto out;
}
@@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename(
(*encoded_name)[(*encoded_name_size)] = '\0';
(*encoded_name_size)++;
} else {
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
}
if (rc) {
printk(KERN_ERR "%s: Error attempting to encode "
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
extern const struct inode_operations ecryptfs_symlink_iops;
extern const struct super_operations ecryptfs_sops;
extern const struct dentry_operations ecryptfs_dops;
-extern struct address_space_operations ecryptfs_aops;
+extern const struct address_space_operations ecryptfs_aops;
extern int ecryptfs_verbosity;
extern unsigned int ecryptfs_message_buf_len;
extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2f0945d63297..371e92eeb9a5 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
struct dentry *lower_dir_dentry;
+ dget(lower_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
rc = vfs_unlink(lower_dir_inode, lower_dentry);
if (rc) {
@@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
d_drop(dentry);
out_unlock:
unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
return rc;
}
@@ -770,18 +772,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
}
/**
- * ecryptfs_truncate
+ * truncate_upper
* @dentry: The ecryptfs layer dentry
- * @new_length: The length to expand the file to
+ * @ia: Address of the ecryptfs inode's attributes
+ * @lower_ia: Address of the lower inode's attributes
*
* Function to handle truncations modifying the size of the file. Note
* that the file sizes are interpolated. When expanding, we are simply
- * writing strings of 0's out. When truncating, we need to modify the
- * underlying file size according to the page index interpolations.
+ * writing strings of 0's out. When truncating, we truncate the upper
+ * inode and update the lower_ia according to the page index
+ * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
+ * the caller must use lower_ia in a call to notify_change() to perform
+ * the truncation of the lower inode.
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+static int truncate_upper(struct dentry *dentry, struct iattr *ia,
+ struct iattr *lower_ia)
{
int rc = 0;
struct inode *inode = dentry->d_inode;
@@ -792,7 +799,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
loff_t lower_size_before_truncate;
loff_t lower_size_after_truncate;
- if (unlikely((new_length == i_size)))
+ if (unlikely((ia->ia_size == i_size)))
goto out;
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
/* Set up a fake ecryptfs file, this is used to interface with
@@ -813,28 +820,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
&fake_ecryptfs_file,
ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
/* Switch on growing or shrinking file */
- if (new_length > i_size) {
+ if (ia->ia_size > i_size) {
char zero[] = { 0x00 };
+ lower_ia->ia_valid &= ~ATTR_SIZE;
/* Write a single 0 at the last position of the file;
* this triggers code that will fill in 0's throughout
* the intermediate portion of the previous end of the
* file and the new and of the file */
rc = ecryptfs_write(&fake_ecryptfs_file, zero,
- (new_length - 1), 1);
- } else { /* new_length < i_size_read(inode) */
- /* We're chopping off all the pages down do the page
- * in which new_length is located. Fill in the end of
- * that page from (new_length & ~PAGE_CACHE_MASK) to
+ (ia->ia_size - 1), 1);
+ } else { /* ia->ia_size < i_size_read(inode) */
+ /* We're chopping off all the pages down to the page
+ * in which ia->ia_size is located. Fill in the end of
+ * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
* PAGE_CACHE_SIZE with zeros. */
size_t num_zeros = (PAGE_CACHE_SIZE
- - (new_length & ~PAGE_CACHE_MASK));
+ - (ia->ia_size & ~PAGE_CACHE_MASK));
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = vmtruncate(inode, new_length);
+ rc = vmtruncate(inode, ia->ia_size);
if (rc)
goto out_free;
- rc = vmtruncate(lower_dentry->d_inode, new_length);
+ lower_ia->ia_size = ia->ia_size;
+ lower_ia->ia_valid |= ATTR_SIZE;
goto out_free;
}
if (num_zeros) {
@@ -846,7 +855,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
goto out_free;
}
rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
- new_length, num_zeros);
+ ia->ia_size, num_zeros);
kfree(zeros_virt);
if (rc) {
printk(KERN_ERR "Error attempting to zero out "
@@ -855,7 +864,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
goto out_free;
}
}
- vmtruncate(inode, new_length);
+ vmtruncate(inode, ia->ia_size);
rc = ecryptfs_write_inode_size_to_metadata(inode);
if (rc) {
printk(KERN_ERR "Problem with "
@@ -868,10 +877,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
lower_size_before_truncate =
upper_size_to_lower_size(crypt_stat, i_size);
lower_size_after_truncate =
- upper_size_to_lower_size(crypt_stat, new_length);
- if (lower_size_after_truncate < lower_size_before_truncate)
- vmtruncate(lower_dentry->d_inode,
- lower_size_after_truncate);
+ upper_size_to_lower_size(crypt_stat, ia->ia_size);
+ if (lower_size_after_truncate < lower_size_before_truncate) {
+ lower_ia->ia_size = lower_size_after_truncate;
+ lower_ia->ia_valid |= ATTR_SIZE;
+ } else
+ lower_ia->ia_valid &= ~ATTR_SIZE;
}
out_free:
if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -881,6 +892,33 @@ out:
return rc;
}
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The length to expand the file to
+ *
+ * Simple function that handles the truncation of an eCryptfs inode and
+ * its corresponding lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+ struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
+ struct iattr lower_ia = { .ia_valid = 0 };
+ int rc;
+
+ rc = truncate_upper(dentry, &ia, &lower_ia);
+ if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = notify_change(lower_dentry, &lower_ia);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ }
+ return rc;
+}
+
static int
ecryptfs_permission(struct inode *inode, int mask)
{
@@ -903,6 +941,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
{
int rc = 0;
struct dentry *lower_dentry;
+ struct iattr lower_ia;
struct inode *inode;
struct inode *lower_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -941,15 +980,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
+ memcpy(&lower_ia, ia, sizeof(lower_ia));
+ if (ia->ia_valid & ATTR_FILE)
+ lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
if (ia->ia_valid & ATTR_SIZE) {
- ecryptfs_printk(KERN_DEBUG,
- "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
- ia->ia_valid, ATTR_SIZE);
- rc = ecryptfs_truncate(dentry, ia->ia_size);
- /* ecryptfs_truncate handles resizing of the lower file */
- ia->ia_valid &= ~ATTR_SIZE;
- ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
- ia->ia_valid);
+ rc = truncate_upper(dentry, ia, &lower_ia);
if (rc < 0)
goto out;
}
@@ -958,11 +993,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
* mode change is for clearing setuid/setgid bits. Allow lower fs
* to interpret this in its own way.
*/
- if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
- ia->ia_valid &= ~ATTR_MODE;
+ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ lower_ia.ia_valid &= ~ATTR_MODE;
mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = notify_change(lower_dentry, ia);
+ rc = notify_change(lower_dentry, &lower_ia);
mutex_unlock(&lower_dentry->d_inode->i_mutex);
out:
fsstack_copy_attr_all(inode, lower_inode, NULL);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 259525c9abb8..a0a7847567e9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig(
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
- (*global_auth_tok) = walker;
+ rc = key_validate(walker->global_auth_tok_key);
+ if (!rc)
+ (*global_auth_tok) = walker;
goto out;
}
}
@@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
}
/* TODO: Support other key modules than passphrase for
* filename encryption */
- BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+ rc = -EOPNOTSUPP;
+ printk(KERN_INFO "%s: Filename encryption only supports "
+ "password tokens\n", __func__);
+ goto out_free_unlock;
+ }
sg_init_one(
&s->hash_sg,
(u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
}
/* TODO: Support other key modules than passphrase for
* filename encryption */
- BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+ rc = -EOPNOTSUPP;
+ printk(KERN_INFO "%s: Filename encryption only supports "
+ "password tokens\n", __func__);
+ goto out_free_unlock;
+ }
rc = crypto_blkcipher_setkey(
s->desc.tfm,
s->auth_tok->token.password.session_key_encryption_key,
@@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
rc = -EINVAL;
goto out_free;
}
- ecryptfs_cipher_code_to_string(crypt_stat->cipher,
- (u16)data[(*packet_size)]);
+ rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+ (u16)data[(*packet_size)]);
+ if (rc)
+ goto out_free;
/* A little extra work to differentiate among the AES key
* sizes; see RFC2440 */
switch(data[(*packet_size)++]) {
@@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->key_size =
(*new_auth_tok)->session_key.encrypted_key_size;
}
- ecryptfs_init_crypt_ctx(crypt_stat);
+ rc = ecryptfs_init_crypt_ctx(crypt_stat);
+ if (rc)
+ goto out_free;
if (unlikely(data[(*packet_size)++] != 0x03)) {
printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
rc = -ENOSYS;
@@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache;
int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
{
struct ecryptfs_key_sig *new_key_sig;
- int rc = 0;
new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
if (!new_key_sig) {
- rc = -ENOMEM;
printk(KERN_ERR
"Error allocating from ecryptfs_key_sig_cache\n");
- goto out;
+ return -ENOMEM;
}
memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
- mutex_lock(&crypt_stat->keysig_list_mutex);
+ /* Caller must hold keysig_list_mutex */
list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
- mutex_unlock(&crypt_stat->keysig_list_mutex);
-out:
- return rc;
+
+ return 0;
}
struct kmem_cache *ecryptfs_global_auth_tok_cache;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c6d7a4d748a0..e14cf7e588db 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
const struct cred *cred)
{
struct ecryptfs_open_req *req;
+ int flags = O_LARGEFILE;
int rc = 0;
/* Corresponding dput() and mntput() are done when the
@@ -143,10 +144,14 @@ int ecryptfs_privileged_open(struct file **lower_file,
* destroyed. */
dget(lower_dentry);
mntget(lower_mnt);
- (*lower_file) = dentry_open(lower_dentry, lower_mnt,
- (O_RDWR | O_LARGEFILE), cred);
+ flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
+ (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
if (!IS_ERR(*lower_file))
goto out;
+ if (flags & O_RDONLY) {
+ rc = PTR_ERR((*lower_file));
+ goto out;
+ }
req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
if (!req) {
rc = -ENOMEM;
@@ -180,21 +185,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
__func__);
goto out_unlock;
}
- if (IS_ERR(*req->lower_file)) {
+ if (IS_ERR(*req->lower_file))
rc = PTR_ERR(*req->lower_file);
- dget(lower_dentry);
- mntget(lower_mnt);
- (*lower_file) = dentry_open(lower_dentry, lower_mnt,
- (O_RDONLY | O_LARGEFILE), cred);
- if (IS_ERR(*lower_file)) {
- rc = PTR_ERR(*req->lower_file);
- (*lower_file) = NULL;
- printk(KERN_WARNING "%s: Error attempting privileged "
- "open of lower file with either RW or RO "
- "perms; rc = [%d]. Giving up.\n",
- __func__, rc);
- }
- }
out_unlock:
mutex_unlock(&req->mux);
out_free:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f0aa9883c28..c6ac85d6c701 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
#include <linux/key.h>
#include <linux/parser.h>
#include <linux/fs_stack.h>
+#include <linux/ima.h>
#include "ecryptfs_kernel.h"
/**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
const struct cred *cred = current_cred();
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+ int opened_lower_file = 0;
int rc = 0;
mutex_lock(&inode_info->lower_file_mutex);
@@ -129,15 +131,17 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
rc = ecryptfs_privileged_open(&inode_info->lower_file,
lower_dentry, lower_mnt, cred);
- if (rc || IS_ERR(inode_info->lower_file)) {
+ if (rc) {
printk(KERN_ERR "Error opening lower persistent file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
"rc = [%d]\n", lower_dentry, lower_mnt, rc);
- rc = PTR_ERR(inode_info->lower_file);
inode_info->lower_file = NULL;
- }
+ } else
+ opened_lower_file = 1;
}
mutex_unlock(&inode_info->lower_file_mutex);
+ if (opened_lower_file)
+ ima_counts_get(inode_info->lower_file);
return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..df4ce99d0597 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
sizeof(u64));
kfree(file_size_virt);
- if (rc)
+ if (rc < 0)
printk(KERN_ERR "%s: Error writing file size to header; "
"rc = [%d]\n", __func__, rc);
+ else
+ rc = 0;
out:
return rc;
}
@@ -545,7 +547,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
return rc;
}
-struct address_space_operations ecryptfs_aops = {
+const struct address_space_operations ecryptfs_aops = {
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
.write_begin = ecryptfs_write_begin,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index a137c6ea2fee..0cc4fafd6552 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -34,15 +34,14 @@
*
* Write data to the lower file.
*
- * Returns zero on success; non-zero on error
+ * Returns bytes written on success; less than zero on error
*/
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size)
{
struct ecryptfs_inode_info *inode_info;
- ssize_t octets_written;
mm_segment_t fs_save;
- int rc = 0;
+ ssize_t rc;
inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
mutex_lock(&inode_info->lower_file_mutex);
@@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
inode_info->lower_file->f_pos = offset;
fs_save = get_fs();
set_fs(get_ds());
- octets_written = vfs_write(inode_info->lower_file, data, size,
- &inode_info->lower_file->f_pos);
+ rc = vfs_write(inode_info->lower_file, data, size,
+ &inode_info->lower_file->f_pos);
set_fs(fs_save);
- if (octets_written < 0) {
- printk(KERN_ERR "%s: octets_written = [%td]; "
- "expected [%td]\n", __func__, octets_written, size);
- rc = -EINVAL;
- }
mutex_unlock(&inode_info->lower_file_mutex);
mark_inode_dirty_sync(ecryptfs_inode);
return rc;
@@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
+ offset_in_page);
virt = kmap(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
+ if (rc > 0)
+ rc = 0;
kunmap(page_for_lower);
return rc;
}
@@ -229,30 +225,24 @@ out:
* Read @size bytes of data at byte offset @offset from the lower
* inode into memory location @data.
*
- * Returns zero on success; non-zero on error
+ * Returns bytes read on success; 0 on EOF; less than zero on error
*/
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode)
{
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_inode);
- ssize_t octets_read;
mm_segment_t fs_save;
- int rc = 0;
+ ssize_t rc;
mutex_lock(&inode_info->lower_file_mutex);
BUG_ON(!inode_info->lower_file);
inode_info->lower_file->f_pos = offset;
fs_save = get_fs();
set_fs(get_ds());
- octets_read = vfs_read(inode_info->lower_file, data, size,
- &inode_info->lower_file->f_pos);
+ rc = vfs_read(inode_info->lower_file, data, size,
+ &inode_info->lower_file->f_pos);
set_fs(fs_save);
- if (octets_read < 0) {
- printk(KERN_ERR "%s: octets_read = [%td]; "
- "expected [%td]\n", __func__, octets_read, size);
- rc = -EINVAL;
- }
mutex_unlock(&inode_info->lower_file_mutex);
return rc;
}
@@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
virt = kmap(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
+ if (rc > 0)
+ rc = 0;
kunmap(page_for_ecryptfs);
flush_dcache_page(page_for_ecryptfs);
return rc;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 12d649602d3a..b15a43a80ab7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
struct ecryptfs_inode_info *inode_info;
inode_info = ecryptfs_inode_to_private(inode);
- mutex_lock(&inode_info->lower_file_mutex);
if (inode_info->lower_file) {
struct dentry *lower_dentry =
inode_info->lower_file->f_dentry;
@@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
d_drop(lower_dentry);
}
}
- mutex_unlock(&inode_info->lower_file_mutex);
ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 31d12de83a2a..8b47e4200e65 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n)
}
EXPORT_SYMBOL_GPL(eventfd_signal);
+static void eventfd_free_ctx(struct eventfd_ctx *ctx)
+{
+ kfree(ctx);
+}
+
static void eventfd_free(struct kref *kref)
{
struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
- kfree(ctx);
+ eventfd_free_ctx(ctx);
}
/**
@@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
-SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+/**
+ * eventfd_file_create - Creates an eventfd file pointer.
+ * @count: Initial eventfd counter value.
+ * @flags: Flags for the eventfd file.
+ *
+ * This function creates an eventfd file pointer, w/out installing it into
+ * the fd table. This is useful when the eventfd file is used during the
+ * initialization of data structures that require extra setup after the eventfd
+ * creation. So the eventfd creation is split into the file pointer creation
+ * phase, and the file descriptor installation phase.
+ * In this way races with userspace closing the newly installed file descriptor
+ * can be avoided.
+ * Returns an eventfd file pointer, or a proper error pointer.
+ */
+struct file *eventfd_file_create(unsigned int count, int flags)
{
- int fd;
+ struct file *file;
struct eventfd_ctx *ctx;
/* Check the EFD_* constants for consistency. */
@@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
if (flags & ~EFD_FLAGS_SET)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
kref_init(&ctx->kref);
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- /*
- * When we call this, the initialization must be complete, since
- * anon_inode_getfd() will install the fd.
- */
- fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
- flags & EFD_SHARED_FCNTL_FLAGS);
- if (fd < 0)
- kfree(ctx);
+ file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
+ flags & EFD_SHARED_FCNTL_FLAGS);
+ if (IS_ERR(file))
+ eventfd_free_ctx(ctx);
+
+ return file;
+}
+
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
+{
+ int fd, error;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = eventfd_file_create(count, flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
+ fd_install(fd, file);
+
return fd;
+
+err_put_unused_fd:
+ put_unused_fd(fd);
+
+ return error;
}
SYSCALL_DEFINE1(eventfd, unsigned int, count)
diff --git a/fs/exec.c b/fs/exec.c
index 172ceb6edde4..ba112bd4a339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
@@ -55,6 +55,7 @@
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
int suid_dumpable = 0;
/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -622,10 +624,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
/* Move stack pages down in memory. */
if (stack_shift) {
ret = shift_arg_pages(vma, stack_shift);
- if (ret) {
- up_write(&mm->mmap_sem);
- return ret;
- }
+ if (ret)
+ goto out_unlock;
}
#ifdef CONFIG_STACK_GROWSUP
@@ -639,7 +639,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
out_unlock:
up_write(&mm->mmap_sem);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
@@ -845,6 +845,9 @@ static int de_thread(struct task_struct *tsk)
sig->notify_count = 0;
no_thread_group:
+ if (current->mm)
+ setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+
exit_itimers(sig);
flush_itimer_signals();
@@ -923,7 +926,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
task_lock(tsk);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
- perf_counter_comm(tsk);
+ perf_event_comm(tsk);
}
int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +1000,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* security domain:
*/
if (!get_dumpable(current->mm))
- perf_counter_exit_task(current);
+ perf_event_exit_task(current);
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1354,6 +1357,8 @@ int do_execve(char * filename,
if (retval < 0)
goto out;
+ current->stack_start = current->mm->start_stack;
+
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1388,18 +1393,16 @@ out_ret:
return retval;
}
-int set_binfmt(struct linux_binfmt *new)
+void set_binfmt(struct linux_binfmt *new)
{
- struct linux_binfmt *old = current->binfmt;
+ struct mm_struct *mm = current->mm;
- if (new) {
- if (!try_module_get(new->module))
- return -1;
- }
- current->binfmt = new;
- if (old)
- module_put(old->module);
- return 0;
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+
+ mm->binfmt = new;
+ if (new)
+ __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);
@@ -1723,6 +1726,29 @@ int get_dumpable(struct mm_struct *mm)
return (ret >= 2) ? 2 : ret;
}
+static void wait_for_dump_helpers(struct file *file)
+{
+ struct pipe_inode_info *pipe;
+
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+
+ pipe_lock(pipe);
+ pipe->readers++;
+ pipe->writers--;
+
+ while ((pipe->readers > 1) && (!signal_pending(current))) {
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_wait(pipe);
+ }
+
+ pipe->readers--;
+ pipe->writers++;
+ pipe_unlock(pipe);
+
+}
+
+
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
@@ -1739,11 +1765,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
char **helper_argv = NULL;
int helper_argc = 0;
- char *delimit;
+ int dump_count = 0;
+ static atomic_t core_dump_count = ATOMIC_INIT(0);
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
@@ -1794,54 +1821,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
lock_kernel();
ispipe = format_corename(corename, signr);
unlock_kernel();
- /*
- * Don't bother to check the RLIMIT_CORE value if core_pattern points
- * to a pipe. Since we're not writing directly to the filesystem
- * RLIMIT_CORE doesn't really apply, as no actual core file will be
- * created unless the pipe reader choses to write out the core file
- * at which point file size limits and permissions will be imposed
- * as it does with any other process
- */
+
if ((!ispipe) && (core_limit < binfmt->min_coredump))
goto fail_unlock;
if (ispipe) {
+ if (core_limit == 0) {
+ /*
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * core_limit of 0 here as a speacial value. Any
+ * non-zero limit gets set to RLIM_INFINITY below, but
+ * a limit of 0 skips the dump. This is a consistent
+ * way to catch recursive crashes. We can still crash
+ * if the core_pattern binary sets RLIM_CORE = !0
+ * but it runs as root, and can do lots of stupid things
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 0\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
- goto fail_unlock;
- }
- /* Terminate the string before the first option */
- delimit = strchr(corename, ' ');
- if (delimit)
- *delimit = '\0';
- delimit = strrchr(helper_argv[0], '/');
- if (delimit)
- delimit++;
- else
- delimit = helper_argv[0];
- if (!strcmp(delimit, current->comm)) {
- printk(KERN_NOTICE "Recursive core dump detected, "
- "aborting\n");
- goto fail_unlock;
+ goto fail_dropcount;
}
core_limit = RLIM_INFINITY;
/* SIGPIPE can happen, but it's just never processed */
- if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
+ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
&file)) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
corename);
- goto fail_unlock;
+ goto fail_dropcount;
}
} else
file = filp_open(corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
if (IS_ERR(file))
- goto fail_unlock;
+ goto fail_dropcount;
inode = file->f_path.dentry->d_inode;
if (inode->i_nlink > 1)
goto close_fail; /* multiple links - don't dump */
@@ -1870,7 +1906,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
if (retval)
current->signal->group_exit_code |= 0x80;
close_fail:
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(file);
filp_close(file, NULL);
+fail_dropcount:
+ if (dump_count)
+ atomic_dec(&core_dump_count);
fail_unlock:
if (helper_argv)
argv_free(helper_argv);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5ab10c3bbebe..9f500dec3b59 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
}
lock_super(sb);
- lock_kernel();
sbi = sb->s_fs_info;
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
out:
if (or)
osd_end_request(or);
- unlock_kernel();
unlock_super(sb);
kfree(fscb);
return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
int num_pend;
struct exofs_sb_info *sbi = sb->s_fs_info;
- lock_kernel();
-
if (sb->s_dirt)
exofs_write_super(sb);
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
osduld_put_device(sbi->s_dev);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
-
- unlock_kernel();
}
/*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 197c7db583c7..e9e175949a63 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -6,7 +6,7 @@
* and for mapping back from file handles to dentries.
*
* For details on why we do all the strange and hairy things in here
- * take a look at Documentation/filesystems/Exporting.
+ * take a look at Documentation/filesystems/nfs/Exporting.
*/
#include <linux/exportfs.h>
#include <linux/fs.h>
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
}
-static int
+int
ext2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext2_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
#ifdef CONFIG_EXT2_FS_POSIX_ACL
/* acl.c */
-extern int ext2_permission (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
#else
#include <linux/sched.h>
-#define ext2_permission NULL
+#define ext2_check_acl NULL
#define ext2_get_acl NULL
#define ext2_set_acl NULL
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9a8a8e27a063..da318b0fa637 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child);
/* super.c */
extern void ext2_error (struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
-extern void ext2_warning (struct super_block *, const char *, const char *, ...)
+extern void ext2_msg(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext2_update_dynamic_rev (struct super_block *sb);
extern void ext2_write_super (struct super_block *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
.fiemap = ext2_fiemap,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..71b032c65a02 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -137,7 +137,8 @@ static int ext2_block_to_path(struct inode *inode,
int final = 0;
if (i_block < 0) {
- ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0");
+ ext2_msg(inode->i_sb, KERN_WARNING,
+ "warning: %s: block < 0", __func__);
} else if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
@@ -157,7 +158,8 @@ static int ext2_block_to_path(struct inode *inode,
offsets[n++] = i_block & (ptrs - 1);
final = ptrs;
} else {
- ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big");
+ ext2_msg(inode->i_sb, KERN_WARNING,
+ "warning: %s: block is too big", __func__);
}
if (boundary)
*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -482,7 +484,7 @@ static int ext2_alloc_branch(struct inode *inode,
unlock_buffer(bh);
mark_buffer_dirty_inode(bh, inode);
/* We used to sync bh here if IS_SYNC(inode).
- * But we now rely upon generic_osync_inode()
+ * But we now rely upon generic_write_sync()
* and b_inode_buffers. But not for directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
@@ -819,6 +821,7 @@ const struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +840,7 @@ const struct address_space_operations ext2_nobh_aops = {
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
+ .error_remove_page = generic_error_remove_page,
};
/*
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 78d9b925fc94..dd7175ce5606 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -70,7 +70,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
if (PTR_ERR(inode) == -ESTALE) {
ext2_error(dir->i_sb, __func__,
"deleted inode referenced: %lu",
- ino);
+ (unsigned long) ino);
return ERR_PTR(-EIO);
} else {
return ERR_CAST(inode);
@@ -400,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
const struct inode_operations ext2_special_inode_operations = {
@@ -411,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1a9ffee47d56..1388802b7803 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function,
}
va_start(args, fmt);
- printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function);
+ printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
vprintk(fmt, args);
printk("\n");
va_end(args);
if (test_opt(sb, ERRORS_PANIC))
- panic("EXT2-fs panic from previous error\n");
+ panic("EXT2-fs: panic from previous error\n");
if (test_opt(sb, ERRORS_RO)) {
- printk("Remounting filesystem read-only\n");
+ ext2_msg(sb, KERN_CRIT,
+ "error: remounting filesystem read-only");
sb->s_flags |= MS_RDONLY;
}
}
-void ext2_warning (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext2_msg(struct super_block *sb, const char *prefix,
+ const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ",
- sb->s_id, function);
+ printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
vprintk(fmt, args);
printk("\n");
va_end(args);
@@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb)
if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV)
return;
- ext2_warning(sb, __func__,
- "updating to rev %d because of new feature flag, "
- "running e2fsck is recommended",
+ ext2_msg(sb, KERN_WARNING,
+ "warning: updating to rev %d because of "
+ "new feature flag, running e2fsck is recommended",
EXT2_DYNAMIC_REV);
es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO);
@@ -419,10 +419,10 @@ static const match_table_t tokens = {
{Opt_err, NULL}
};
-static int parse_options (char * options,
- struct ext2_sb_info *sbi)
+static int parse_options(char *options, struct super_block *sb)
{
- char * p;
+ char *p;
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
substring_t args[MAX_OPT_ARGS];
int option;
@@ -505,7 +505,8 @@ static int parse_options (char * options,
#else
case Opt_user_xattr:
case Opt_nouser_xattr:
- printk("EXT2 (no)user_xattr options not supported\n");
+ ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
+ "not supported");
break;
#endif
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -518,14 +519,15 @@ static int parse_options (char * options,
#else
case Opt_acl:
case Opt_noacl:
- printk("EXT2 (no)acl options not supported\n");
+ ext2_msg(sb, KERN_INFO,
+ "(no)acl options not supported");
break;
#endif
case Opt_xip:
#ifdef CONFIG_EXT2_FS_XIP
set_opt (sbi->s_mount_opt, XIP);
#else
- printk("EXT2 xip option not supported\n");
+ ext2_msg(sb, KERN_INFO, "xip option not supported");
#endif
break;
@@ -542,19 +544,18 @@ static int parse_options (char * options,
case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
- printk(KERN_ERR
- "EXT2-fs: quota operations not supported.\n");
-
+ ext2_msg(sb, KERN_INFO,
+ "quota operations not supported");
break;
#endif
case Opt_reservation:
set_opt(sbi->s_mount_opt, RESERVATION);
- printk("reservations ON\n");
+ ext2_msg(sb, KERN_INFO, "reservations ON");
break;
case Opt_noreservation:
clear_opt(sbi->s_mount_opt, RESERVATION);
- printk("reservations OFF\n");
+ ext2_msg(sb, KERN_INFO, "reservations OFF");
break;
case Opt_ignore:
break;
@@ -573,34 +574,40 @@ static int ext2_setup_super (struct super_block * sb,
struct ext2_sb_info *sbi = EXT2_SB(sb);
if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) {
- printk ("EXT2-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ ext2_msg(sb, KERN_ERR,
+ "error: revision level too high, "
+ "forcing read-only mode");
res = MS_RDONLY;
}
if (read_only)
return res;
if (!(sbi->s_mount_state & EXT2_VALID_FS))
- printk ("EXT2-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
else if ((sbi->s_mount_state & EXT2_ERROR_FS))
- printk ("EXT2-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk ("EXT2-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
- (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk ("EXT2-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
+ (le32_to_cpu(es->s_lastcheck) +
+ le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+ ext2_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
if (!le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext2_write_super(sb);
if (test_opt (sb, DEBUG))
- printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, "
- "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
+ "bpg=%lu, ipg=%lu, mo=%04lx]",
EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
sbi->s_frag_size,
sbi->s_groups_count,
@@ -767,7 +774,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
*/
blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
if (!blocksize) {
- printk ("EXT2-fs: unable to set blocksize\n");
+ ext2_msg(sb, KERN_ERR, "error: unable to set blocksize");
goto failed_sbi;
}
@@ -783,7 +790,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logic_sb_block))) {
- printk ("EXT2-fs: unable to read superblock\n");
+ ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
goto failed_sbi;
}
/*
@@ -826,7 +833,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, RESERVATION);
- if (!parse_options ((char *) data, sbi))
+ if (!parse_options((char *) data, sb))
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -840,8 +847,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- printk("EXT2-fs warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended\n");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -849,16 +857,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
*/
features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
if (features) {
- printk("EXT2-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext2_msg(sb, KERN_ERR, "error: couldn't mount because of "
+ "unsupported optional features (%x)",
+ le32_to_cpu(features));
goto failed_mount;
}
if (!(sb->s_flags & MS_RDONLY) &&
(features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
- printk("EXT2-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ le32_to_cpu(features));
goto failed_mount;
}
@@ -866,7 +874,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
if (!silent)
- printk("XIP: Unsupported blocksize\n");
+ ext2_msg(sb, KERN_ERR,
+ "error: unsupported blocksize for xip");
goto failed_mount;
}
@@ -875,7 +884,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n");
+ ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
goto failed_sbi;
}
@@ -883,14 +892,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
offset = (sb_block*BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if(!bh) {
- printk("EXT2-fs: Couldn't read superblock on "
- "2nd try.\n");
+ ext2_msg(sb, KERN_ERR, "error: couldn't read"
+ "superblock on 2nd try");
goto failed_sbi;
}
es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
- printk ("EXT2-fs: Magic mismatch, very weird !\n");
+ ext2_msg(sb, KERN_ERR, "error: magic mismatch");
goto failed_mount;
}
}
@@ -906,7 +915,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
!is_power_of_2(sbi->s_inode_size) ||
(sbi->s_inode_size > blocksize)) {
- printk ("EXT2-fs: unsupported inode size: %d\n",
+ ext2_msg(sb, KERN_ERR,
+ "error: unsupported inode size: %d",
sbi->s_inode_size);
goto failed_mount;
}
@@ -943,29 +953,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_blocksize != bh->b_size) {
if (!silent)
- printk ("VFS: Unsupported blocksize on dev "
- "%s.\n", sb->s_id);
+ ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
goto failed_mount;
}
if (sb->s_blocksize != sbi->s_frag_size) {
- printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n",
+ ext2_msg(sb, KERN_ERR,
+ "error: fragsize %lu != blocksize %lu"
+ "(not supported yet)",
sbi->s_frag_size, sb->s_blocksize);
goto failed_mount;
}
if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
- printk ("EXT2-fs: #blocks per group too big: %lu\n",
+ ext2_msg(sb, KERN_ERR,
+ "error: #blocks per group too big: %lu",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
- printk ("EXT2-fs: #fragments per group too big: %lu\n",
+ ext2_msg(sb, KERN_ERR,
+ "error: #fragments per group too big: %lu",
sbi->s_frags_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
- printk ("EXT2-fs: #inodes per group too big: %lu\n",
+ ext2_msg(sb, KERN_ERR,
+ "error: #inodes per group too big: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
@@ -979,13 +993,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk ("EXT2-fs: not enough memory\n");
+ ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount;
}
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
- printk ("EXT2-fs: not enough memory\n");
+ ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount_group_desc;
}
for (i = 0; i < db_count; i++) {
@@ -994,12 +1008,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->s_group_desc[i]) {
for (j = 0; j < i; j++)
brelse (sbi->s_group_desc[j]);
- printk ("EXT2-fs: unable to read group descriptors\n");
+ ext2_msg(sb, KERN_ERR,
+ "error: unable to read group descriptors");
goto failed_mount_group_desc;
}
}
if (!ext2_check_descriptors (sb)) {
- printk ("EXT2-fs: group descriptors corrupted!\n");
+ ext2_msg(sb, KERN_ERR, "group descriptors corrupted");
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
@@ -1032,7 +1047,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_count_dirs(sb));
}
if (err) {
- printk(KERN_ERR "EXT2-fs: insufficient memory\n");
+ ext2_msg(sb, KERN_ERR, "error: insufficient memory");
goto failed_mount3;
}
/*
@@ -1048,27 +1063,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
iput(root);
- printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
+ ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
goto failed_mount3;
}
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
iput(root);
- printk(KERN_ERR "EXT2-fs: get root inode failed\n");
+ ext2_msg(sb, KERN_ERR, "error: get root inode failed");
ret = -ENOMEM;
goto failed_mount3;
}
if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
- ext2_warning(sb, __func__,
- "mounting ext3 filesystem as ext2");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: mounting ext3 filesystem as ext2");
ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
return 0;
cantfind_ext2:
if (!silent)
- printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
- sb->s_id);
+ ext2_msg(sb, KERN_ERR,
+ "error: can't find an ext2 filesystem on dev %s.",
+ sb->s_id);
goto failed_mount;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -1121,8 +1137,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
static int ext2_sync_fs(struct super_block *sb, int wait)
{
struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+ struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
lock_kernel();
+ if (buffer_write_io_error(sbh)) {
+ /*
+ * Oh, dear. A previous attempt to write the
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ ext2_msg(sb, KERN_ERR,
+ "previous I/O error to superblock detected\n");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+
if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
ext2_debug("setting valid to 0\n");
es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
@@ -1170,7 +1202,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
/*
* Allow the "check" option to be passed as a remount option.
*/
- if (!parse_options (data, sbi)) {
+ if (!parse_options(data, sb)) {
err = -EINVAL;
goto restore_opts;
}
@@ -1182,7 +1214,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
EXT2_MOUNT_XIP if not */
if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
- printk("XIP: Unsupported blocksize\n");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: unsupported blocksize for xip");
err = -EINVAL;
goto restore_opts;
}
@@ -1191,8 +1224,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
(old_mount_opt & EXT2_MOUNT_XIP)) &&
invalidate_inodes(sb)) {
- ext2_warning(sb, __func__, "refusing change of xip flag "
- "with busy inodes while remounting");
+ ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
+ "xip flag with busy inodes while remounting");
sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
}
@@ -1216,9 +1249,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
~EXT2_FEATURE_RO_COMPAT_SUPP);
if (ret) {
- printk("EXT2-fs: %s: couldn't remount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ ext2_msg(sb, KERN_WARNING,
+ "warning: couldn't remount RDWR because of "
+ "unsupported optional features (%x).",
+ le32_to_cpu(ret));
err = -EROFS;
goto restore_opts;
}
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..322a56b2dfb1 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
void **kaddr, unsigned long *pfn)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- struct block_device_operations *ops = bdev->bd_disk->fops;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
sector_t sector;
sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
@@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb)
if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
!sb->s_bdev->bd_disk->fops->direct_access) {
sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
- ext2_warning(sb, __func__,
- "ignoring xip option - not supported by bdev");
+ ext2_msg(sb, KERN_WARNING,
+ "warning: ignoring xip option - "
+ "not supported by bdev");
}
}
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext3_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext3_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext3_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext3_new_inode.
*
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
#ifdef CONFIG_EXT3_FS_POSIX_ACL
/* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
static inline int
ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
-
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext3_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext3_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
-}
-
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
+ .aio_write = generic_file_aio_write,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
.fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..8209f266e9ad 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
*/
#include <linux/time.h>
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
@@ -45,19 +46,21 @@
int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
int ret = 0;
+ tid_t commit_tid;
+
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
J_ASSERT(ext3_journal_current_handle() == NULL);
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for a proper transaction
+ * to commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -72,20 +75,23 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
goto out;
}
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync)
+ commit_tid = atomic_read(&ei->i_datasync_tid);
+ else
+ commit_tid = atomic_read(&ei->i_sync_tid);
+
+ if (log_start_commit(journal, commit_tid)) {
+ log_wait_commit(journal, commit_tid);
goto out;
+ }
/*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
+ * In case we didn't commit a transaction, we have to flush
+ * disk caches manually so that data really is on persistent
+ * storage
*/
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- ret = sync_inode(inode, &wbc);
- }
+ if (test_opt(inode->i_sb, BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
out:
return ret;
}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..354ed3b47b30 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
@@ -688,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
int err = 0;
struct ext3_block_alloc_info *block_i;
ext3_fsblk_t current_block;
+ struct ext3_inode_info *ei = EXT3_I(inode);
- block_i = EXT3_I(inode)->i_block_alloc_info;
+ block_i = ei->i_block_alloc_info;
/*
* If we're splicing into a [td]indirect block (as opposed to the
* inode) then we need to get write access to the [td]indirect block
@@ -730,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
+ /* ext3_mark_inode_dirty already updated i_sync_tid */
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
/* had we spliced it onto indirect block? */
if (where->bh) {
@@ -1724,6 +1738,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1746,9 +1761,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext3_get_block, NULL);
+ if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -1819,6 +1837,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_writeback_aops = {
@@ -1834,6 +1853,7 @@ static const struct address_space_operations ext3_writeback_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_journalled_aops = {
@@ -1848,6 +1868,7 @@ static const struct address_space_operations ext3_journalled_aops = {
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext3_set_aops(struct inode *inode)
@@ -2072,7 +2093,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
ext3_journal_dirty_metadata(handle, bh);
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2303,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
ext3_free_blocks(handle, inode, nr, 1);
@@ -2736,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
struct ext3_inode_info *ei;
struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+ transaction_t *transaction;
long ret;
int block;
@@ -2813,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ atomic_set(&ei->i_sync_tid, tid);
+ atomic_set(&ei->i_datasync_tid, tid);
+ }
+
if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
/*
@@ -2892,6 +2939,10 @@ static int ext3_do_update_inode(handle_t *handle,
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+again:
+ /* we can't allow multiple procs in here at once, its a bit racey */
+ lock_buffer(bh);
+
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT3_STATE_NEW)
@@ -2951,16 +3002,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2983,11 +3038,13 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
ei->i_state &= ~EXT3_STATE_NEW;
+ atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
out_brelse:
brelse (bh);
ext3_std_error(inode->i_sb, err);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..46433b30e45f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
if (is_handle_aborted(handle))
return;
- printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
- caller, errstr, err_fn);
+ printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
+ caller, errstr, err_fn);
journal_abort_handle(handle);
}
+void ext3_msg(struct super_block *sb, const char *prefix,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -174,12 +186,13 @@ static void ext3_handle_error(struct super_block *sb)
journal_abort(journal, -EIO);
}
if (test_opt (sb, ERRORS_RO)) {
- printk (KERN_CRIT "Remounting filesystem read-only\n");
+ ext3_msg(sb, KERN_CRIT,
+ "error: remounting filesystem read-only");
sb->s_flags |= MS_RDONLY;
}
ext3_commit_super(sb, es, 1);
if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs (device %s): panic forced after error\n",
+ panic("EXT3-fs (%s): panic forced after error\n",
sb->s_id);
}
@@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function,
return;
errstr = ext3_decode_error(sb, errno, nbuf);
- printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
- sb->s_id, function, errstr);
+ ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
ext3_handle_error(sb);
}
@@ -268,21 +280,20 @@ void ext3_abort (struct super_block * sb, const char * function,
{
va_list args;
- printk (KERN_CRIT "ext3_abort called.\n");
-
va_start(args, fmt);
- printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
+ printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
vprintk(fmt, args);
printk("\n");
va_end(args);
if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs panic from previous error\n");
+ panic("EXT3-fs: panic from previous error\n");
if (sb->s_flags & MS_RDONLY)
return;
- printk(KERN_CRIT "Remounting filesystem read-only\n");
+ ext3_msg(sb, KERN_CRIT,
+ "error: remounting filesystem read-only");
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
sb->s_flags |= MS_RDONLY;
EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
@@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function,
va_list args;
va_start(args, fmt);
- printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ",
+ printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
sb->s_id, function);
vprintk(fmt, args);
printk("\n");
@@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb)
if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
return;
- ext3_warning(sb, __func__,
- "updating to rev %d because of new feature flag, "
- "running e2fsck is recommended",
- EXT3_DYNAMIC_REV);
+ ext3_msg(sb, KERN_WARNING,
+ "warning: updating to rev %d because of "
+ "new feature flag, running e2fsck is recommended",
+ EXT3_DYNAMIC_REV);
es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
@@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb)
/*
* Open the external journal device
*/
-static struct block_device *ext3_blkdev_get(dev_t dev)
+static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
char b[BDEVNAME_SIZE];
@@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev)
return bdev;
fail:
- printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext3_msg(sb, "error: failed to open journal device %s: %ld",
+ __bdevname(dev, b), PTR_ERR(bdev));
+
return NULL;
}
@@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
{
struct list_head *l;
- printk(KERN_ERR "sb orphan head is %d\n",
+ ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
le32_to_cpu(sbi->s_es->s_last_orphan));
- printk(KERN_ERR "sb_info orphan list:\n");
+ ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
list_for_each(l, &sbi->s_orphan) {
struct inode *inode = orphan_list_entry(l);
- printk(KERN_ERR " "
+ ext3_msg(sb, KERN_ERR, " "
"inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
inode->i_sb->s_id, inode->i_ino, inode,
inode->i_mode, inode->i_nlink,
@@ -466,6 +478,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
return NULL;
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+ atomic_set(&ei->i_datasync_tid, 0);
+ atomic_set(&ei->i_sync_tid, 0);
return &ei->vfs_inode;
}
@@ -634,6 +648,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
+
ext3_show_quota_options(seq, sb);
return 0;
@@ -720,7 +737,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext3_quota_operations = {
+static const struct dquot_operations ext3_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -737,7 +754,7 @@ static struct dquot_operations ext3_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext3_qctl_operations = {
+static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -816,6 +833,7 @@ static const match_table_t tokens = {
{Opt_reservation, "reservation"},
{Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
@@ -843,7 +861,7 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
-static ext3_fsblk_t get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
{
ext3_fsblk_t sb_block;
char *options = (char *) *data;
@@ -854,7 +872,7 @@ static ext3_fsblk_t get_sb_block(void **data)
/*todo: use simple_strtoll with >32bit ext3 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
- printk("EXT3-fs: Invalid sb specification: %s\n",
+ ext3_msg(sb, "error: invalid sb specification: %s",
(char *) *data);
return 1;
}
@@ -954,7 +972,8 @@ static int parse_options (char *options, struct super_block *sb,
#else
case Opt_user_xattr:
case Opt_nouser_xattr:
- printk("EXT3 (no)user_xattr options not supported\n");
+ ext3_msg(sb, KERN_INFO,
+ "(no)user_xattr options not supported");
break;
#endif
#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -967,7 +986,8 @@ static int parse_options (char *options, struct super_block *sb,
#else
case Opt_acl:
case Opt_noacl:
- printk("EXT3 (no)acl options not supported\n");
+ ext3_msg(sb, KERN_INFO,
+ "(no)acl options not supported");
break;
#endif
case Opt_reservation:
@@ -983,16 +1003,16 @@ static int parse_options (char *options, struct super_block *sb,
user to specify an existing inode to be the
journal file. */
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
break;
case Opt_journal_inum:
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
if (match_int(&args[0], &option))
@@ -1001,8 +1021,8 @@ static int parse_options (char *options, struct super_block *sb,
break;
case Opt_journal_dev:
if (is_remount) {
- printk(KERN_ERR "EXT3-fs: cannot specify "
- "journal on remount\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
return 0;
}
if (match_int(&args[0], &option))
@@ -1034,12 +1054,11 @@ static int parse_options (char *options, struct super_block *sb,
if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
== data_opt)
break;
- printk(KERN_ERR
- "EXT3-fs (device %s): Cannot change "
+ ext3_msg(sb, KERN_ERR,
+ "error: cannot change "
"data mode on remount. The filesystem "
"is mounted in data=%s mode and you "
- "try to remount it in data=%s mode.\n",
- sb->s_id,
+ "try to remount it in data=%s mode.",
data_mode_string(sbi->s_mount_opt &
EXT3_MOUNT_DATA_FLAGS),
data_mode_string(data_opt));
@@ -1064,31 +1083,31 @@ static int parse_options (char *options, struct super_block *sb,
set_qf_name:
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
- printk(KERN_ERR
- "EXT3-fs: Cannot change journaled "
- "quota options when quota turned on.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: cannot change journaled "
+ "quota options when quota turned on.");
return 0;
}
qname = match_strdup(&args[0]);
if (!qname) {
- printk(KERN_ERR
- "EXT3-fs: not enough memory for "
- "storing quotafile name.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: not enough memory for "
+ "storing quotafile name.");
return 0;
}
if (sbi->s_qf_names[qtype] &&
strcmp(sbi->s_qf_names[qtype], qname)) {
- printk(KERN_ERR
- "EXT3-fs: %s quota file already "
- "specified.\n", QTYPE2NAME(qtype));
+ ext3_msg(sb, KERN_ERR,
+ "error: %s quota file already "
+ "specified.", QTYPE2NAME(qtype));
kfree(qname);
return 0;
}
sbi->s_qf_names[qtype] = qname;
if (strchr(sbi->s_qf_names[qtype], '/')) {
- printk(KERN_ERR
- "EXT3-fs: quotafile must be on "
- "filesystem root.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: quotafile must be on "
+ "filesystem root.");
kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 0;
@@ -1103,9 +1122,9 @@ set_qf_name:
clear_qf_name:
if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
- printk(KERN_ERR "EXT3-fs: Cannot change "
+ ext3_msg(sb, KERN_ERR, "error: cannot change "
"journaled quota options when "
- "quota turned on.\n");
+ "quota turned on.");
return 0;
}
/*
@@ -1122,9 +1141,9 @@ clear_qf_name:
set_qf_format:
if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
- printk(KERN_ERR "EXT3-fs: Cannot change "
+ ext3_msg(sb, KERN_ERR, "error: cannot change "
"journaled quota options when "
- "quota turned on.\n");
+ "quota turned on.");
return 0;
}
sbi->s_jquota_fmt = qfmt;
@@ -1140,8 +1159,8 @@ set_qf_format:
break;
case Opt_noquota:
if (sb_any_quota_loaded(sb)) {
- printk(KERN_ERR "EXT3-fs: Cannot change quota "
- "options when quota turned on.\n");
+ ext3_msg(sb, KERN_ERR, "error: cannot change "
+ "quota options when quota turned on.");
return 0;
}
clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1152,8 +1171,8 @@ set_qf_format:
case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
- printk(KERN_ERR
- "EXT3-fs: quota options not supported.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: quota options not supported.");
break;
case Opt_usrjquota:
case Opt_grpjquota:
@@ -1161,9 +1180,9 @@ set_qf_format:
case Opt_offgrpjquota:
case Opt_jqfmt_vfsold:
case Opt_jqfmt_vfsv0:
- printk(KERN_ERR
- "EXT3-fs: journaled quota options not "
- "supported.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: journaled quota options not "
+ "supported.");
break;
case Opt_noquota:
break;
@@ -1183,8 +1202,9 @@ set_qf_format:
break;
case Opt_resize:
if (!is_remount) {
- printk("EXT3-fs: resize option only available "
- "for remount\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: resize option only available "
+ "for remount");
return 0;
}
if (match_int(&args[0], &option) != 0)
@@ -1198,9 +1218,9 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ ext3_msg(sb, KERN_ERR,
+ "error: unrecognized mount option \"%s\" "
+ "or missing value", p);
return 0;
}
}
@@ -1218,21 +1238,21 @@ set_qf_format:
(sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
(sbi->s_qf_names[GRPQUOTA] &&
(sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
- printk(KERN_ERR "EXT3-fs: old and new quota "
- "format mixing.\n");
+ ext3_msg(sb, KERN_ERR, "error: old and new quota "
+ "format mixing.");
return 0;
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journaled quota format "
- "not specified.\n");
+ ext3_msg(sb, KERN_ERR, "error: journaled quota format "
+ "not specified.");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journaled quota format "
+ ext3_msg(sb, KERN_ERR, "error: journaled quota format "
"specified with no journaling "
- "enabled.\n");
+ "enabled.");
return 0;
}
}
@@ -1247,31 +1267,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
int res = 0;
if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
- printk (KERN_ERR "EXT3-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: revision level too high, "
+ "forcing read-only mode");
res = MS_RDONLY;
}
if (read_only)
return res;
if (!(sbi->s_mount_state & EXT3_VALID_FS))
- printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
else if ((sbi->s_mount_state & EXT3_ERROR_FS))
- printk (KERN_WARNING
- "EXT3-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk (KERN_WARNING
- "EXT3-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk (KERN_WARNING
- "EXT3-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
#if 0
/* @@@ We _will_ want to clear the valid bit if we find
inconsistencies, to force a fsck at reboot. But for
@@ -1288,22 +1310,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
ext3_commit_super(sb, es, 1);
if (test_opt(sb, DEBUG))
- printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
- "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
+ "bpg=%lu, ipg=%lu, mo=%04lx]",
sb->s_blocksize,
sbi->s_groups_count,
EXT3_BLOCKS_PER_GROUP(sb),
EXT3_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
char b[BDEVNAME_SIZE];
-
- printk("external journal on %s\n",
+ ext3_msg(sb, KERN_INFO, "using external journal on %s",
bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
} else {
- printk("internal journal\n");
+ ext3_msg(sb, KERN_INFO, "using internal journal");
}
return res;
}
@@ -1397,8 +1417,8 @@ static void ext3_orphan_cleanup (struct super_block * sb,
}
if (bdev_read_only(sb->s_bdev)) {
- printk(KERN_ERR "EXT3-fs: write access "
- "unavailable, skipping orphan cleanup.\n");
+ ext3_msg(sb, KERN_ERR, "error: write access "
+ "unavailable, skipping orphan cleanup.");
return;
}
@@ -1412,8 +1432,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
}
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
- sb->s_id);
+ ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
sb->s_flags &= ~MS_RDONLY;
}
#ifdef CONFIG_QUOTA
@@ -1424,9 +1443,9 @@ static void ext3_orphan_cleanup (struct super_block * sb,
if (EXT3_SB(sb)->s_qf_names[i]) {
int ret = ext3_quota_on_mount(sb, i);
if (ret < 0)
- printk(KERN_ERR
- "EXT3-fs: Cannot turn on journaled "
- "quota: error %d\n", ret);
+ ext3_msg(sb, KERN_ERR,
+ "error: cannot turn on journaled "
+ "quota: %d", ret);
}
}
#endif
@@ -1464,11 +1483,11 @@ static void ext3_orphan_cleanup (struct super_block * sb,
#define PLURAL(x) (x), ((x)==1) ? "" : "s"
if (nr_orphans)
- printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
- sb->s_id, PLURAL(nr_orphans));
+ ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
if (nr_truncates)
- printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
- sb->s_id, PLURAL(nr_truncates));
+ ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
@@ -1552,7 +1571,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
struct ext3_super_block *es = NULL;
struct ext3_sb_info *sbi;
ext3_fsblk_t block;
- ext3_fsblk_t sb_block = get_sb_block(&data);
+ ext3_fsblk_t sb_block = get_sb_block(&data, sb);
ext3_fsblk_t logic_sb_block;
unsigned long offset = 0;
unsigned int journal_inum = 0;
@@ -1588,7 +1607,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
+ ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
goto out_fail;
}
@@ -1604,7 +1623,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logic_sb_block))) {
- printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+ ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
goto out_fail;
}
/*
@@ -1663,9 +1682,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- printk(KERN_WARNING
- "EXT3-fs warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -1673,25 +1692,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
*/
features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
if (features) {
- printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount because of unsupported "
+ "optional features (%x)", le32_to_cpu(features));
goto failed_mount;
}
features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount RDWR because of unsupported "
+ "optional features (%x)", le32_to_cpu(features));
goto failed_mount;
}
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT3_MIN_BLOCK_SIZE ||
blocksize > EXT3_MAX_BLOCK_SIZE) {
- printk(KERN_ERR
- "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
- blocksize, sb->s_id);
+ ext3_msg(sb, KERN_ERR,
+ "error: couldn't mount because of unsupported "
+ "filesystem blocksize %d", blocksize);
goto failed_mount;
}
@@ -1702,30 +1721,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
* than the hardware sectorsize for the machine.
*/
if (blocksize < hblock) {
- printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
- "device blocksize %d.\n", blocksize, hblock);
+ ext3_msg(sb, KERN_ERR,
+ "error: fsblocksize %d too small for "
+ "hardware sectorsize %d", blocksize, hblock);
goto failed_mount;
}
brelse (bh);
if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n",
- blocksize);
+ ext3_msg(sb, KERN_ERR,
+ "error: bad blocksize %d", blocksize);
goto out_fail;
}
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if (!bh) {
- printk(KERN_ERR
- "EXT3-fs: Can't read superblock on 2nd try.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: can't read superblock on 2nd try");
goto failed_mount;
}
es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
- printk (KERN_ERR
- "EXT3-fs: Magic mismatch, very weird !\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: magic mismatch");
goto failed_mount;
}
}
@@ -1741,8 +1761,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
(sbi->s_inode_size > blocksize)) {
- printk (KERN_ERR
- "EXT3-fs: unsupported inode size: %d\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: unsupported inode size: %d",
sbi->s_inode_size);
goto failed_mount;
}
@@ -1750,8 +1770,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
le32_to_cpu(es->s_log_frag_size);
if (blocksize != sbi->s_frag_size) {
- printk(KERN_ERR
- "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: fragsize %lu != blocksize %u (unsupported)",
sbi->s_frag_size, blocksize);
goto failed_mount;
}
@@ -1787,31 +1807,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
}
if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #blocks per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_frags_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #fragments per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: #fragments per group too big: %lu",
sbi->s_frags_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT3-fs: #inodes per group too big: %lu\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: #inodes per group too big: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
if (le32_to_cpu(es->s_blocks_count) >
(sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT3-fs: filesystem on %s:"
- " too large to mount safely\n", sb->s_id);
+ ext3_msg(sb, KERN_ERR,
+ "error: filesystem is too large to mount safely");
if (sizeof(sector_t) < 8)
- printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
- "enabled\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: CONFIG_LBDAF not enabled");
goto failed_mount;
}
@@ -1825,7 +1845,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk (KERN_ERR "EXT3-fs: not enough memory\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: not enough memory");
goto failed_mount;
}
@@ -1835,14 +1856,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
- printk (KERN_ERR "EXT3-fs: "
- "can't read group descriptor %d\n", i);
+ ext3_msg(sb, KERN_ERR,
+ "error: can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
}
if (!ext3_check_descriptors (sb)) {
- printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: group descriptors corrupted");
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
@@ -1860,7 +1882,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
ext3_count_dirs(sb));
}
if (err) {
- printk(KERN_ERR "EXT3-fs: insufficient memory\n");
+ ext3_msg(sb, KERN_ERR, "error: insufficient memory");
goto failed_mount3;
}
@@ -1908,9 +1930,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount3;
} else {
if (!silent)
- printk (KERN_ERR
- "ext3: No journal on filesystem on %s\n",
- sb->s_id);
+ ext3_msg(sb, KERN_ERR,
+ "error: no journal found. "
+ "mounting ext3 over ext2?");
goto failed_mount3;
}
@@ -1932,8 +1954,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
case EXT3_MOUNT_WRITEBACK_DATA:
if (!journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
- printk(KERN_ERR "EXT3-fs: Journal does not support "
- "requested data journaling mode\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: journal does not support "
+ "requested data journaling mode");
goto failed_mount4;
}
default:
@@ -1942,8 +1965,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
- printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
- "its supported only with writeback mode\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: ignoring nobh option - "
+ "it is supported only with writeback mode");
clear_opt(sbi->s_mount_opt, NOBH);
}
}
@@ -1954,18 +1978,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
root = ext3_iget(sb, EXT3_ROOT_INO);
if (IS_ERR(root)) {
- printk(KERN_ERR "EXT3-fs: get root inode failed\n");
+ ext3_msg(sb, KERN_ERR, "error: get root inode failed");
ret = PTR_ERR(root);
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
iput(root);
- printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
+ ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
goto failed_mount4;
}
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
- printk(KERN_ERR "EXT3-fs: get root dentry failed\n");
+ ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
iput(root);
ret = -ENOMEM;
goto failed_mount4;
@@ -1984,9 +2008,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
if (needs_recovery)
- printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+ ext3_msg(sb, KERN_INFO, "recovery complete");
ext3_mark_recovery_complete(sb, es);
- printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+ ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
@@ -1996,7 +2020,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
cantfind_ext3:
if (!silent)
- printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
+ ext3_msg(sb, KERN_INFO,
+ "error: can't find ext3 filesystem on dev %s.",
sb->s_id);
goto failed_mount;
@@ -2064,27 +2089,27 @@ static journal_t *ext3_get_journal(struct super_block *sb,
journal_inode = ext3_iget(sb, journal_inum);
if (IS_ERR(journal_inode)) {
- printk(KERN_ERR "EXT3-fs: no journal found.\n");
+ ext3_msg(sb, KERN_ERR, "error: no journal found");
return NULL;
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
- printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+ ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
return NULL;
}
jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
- printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+ ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
iput(journal_inode);
return NULL;
}
journal = journal_init_inode(journal_inode);
if (!journal) {
- printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
+ ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
iput(journal_inode);
return NULL;
}
@@ -2106,13 +2131,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
struct ext3_super_block * es;
struct block_device *bdev;
- bdev = ext3_blkdev_get(j_dev);
+ bdev = ext3_blkdev_get(j_dev, sb);
if (bdev == NULL)
return NULL;
if (bd_claim(bdev, sb)) {
- printk(KERN_ERR
- "EXT3: failed to claim external journal device.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: failed to claim external journal device");
blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
return NULL;
}
@@ -2120,8 +2145,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
- printk(KERN_ERR
- "EXT3-fs: blocksize too small for journal device.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: blocksize too small for journal device");
goto out_bdev;
}
@@ -2129,8 +2154,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
offset = EXT3_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
if (!(bh = __bread(bdev, sb_block, blocksize))) {
- printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
- "external journal\n");
+ ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
+ "external journal");
goto out_bdev;
}
@@ -2138,14 +2163,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- printk(KERN_ERR "EXT3-fs: external journal has "
- "bad superblock\n");
+ ext3_msg(sb, KERN_ERR, "error: external journal has "
+ "bad superblock");
brelse(bh);
goto out_bdev;
}
if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
- printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+ ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
brelse(bh);
goto out_bdev;
}
@@ -2157,19 +2182,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
journal = journal_init_dev(bdev, sb->s_bdev,
start, len, blocksize);
if (!journal) {
- printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: failed to create device journal");
goto out_bdev;
}
journal->j_private = sb;
ll_rw_block(READ, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
- printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+ ext3_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
- printk(KERN_ERR "EXT3-fs: External journal has more than one "
- "user (unsupported) - %d\n",
+ ext3_msg(sb, KERN_ERR,
+ "error: external journal has more than one "
+ "user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
@@ -2195,8 +2222,8 @@ static int ext3_load_journal(struct super_block *sb,
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- printk(KERN_INFO "EXT3-fs: external journal device major/minor "
- "numbers have changed\n");
+ ext3_msg(sb, KERN_INFO, "external journal device major/minor "
+ "numbers have changed");
journal_dev = new_decode_dev(journal_devnum);
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -2211,21 +2238,21 @@ static int ext3_load_journal(struct super_block *sb,
if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT3-fs: INFO: recovery "
- "required on readonly filesystem.\n");
+ ext3_msg(sb, KERN_INFO,
+ "recovery required on readonly filesystem");
if (really_read_only) {
- printk(KERN_ERR "EXT3-fs: write access "
- "unavailable, cannot proceed.\n");
+ ext3_msg(sb, KERN_ERR, "error: write access "
+ "unavailable, cannot proceed");
return -EROFS;
}
- printk (KERN_INFO "EXT3-fs: write access will "
- "be enabled during recovery.\n");
+ ext3_msg(sb, KERN_INFO,
+ "write access will be enabled during recovery");
}
}
if (journal_inum && journal_dev) {
- printk(KERN_ERR "EXT3-fs: filesystem has both journal "
- "and inode journals!\n");
+ ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
+ "and inode journals");
return -EINVAL;
}
@@ -2240,7 +2267,7 @@ static int ext3_load_journal(struct super_block *sb,
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
err = journal_update_format(journal);
if (err) {
- printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+ ext3_msg(sb, KERN_ERR, "error updating journal");
journal_destroy(journal);
return err;
}
@@ -2252,7 +2279,7 @@ static int ext3_load_journal(struct super_block *sb,
err = journal_load(journal);
if (err) {
- printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+ ext3_msg(sb, KERN_ERR, "error loading journal");
journal_destroy(journal);
return err;
}
@@ -2271,16 +2298,17 @@ static int ext3_load_journal(struct super_block *sb,
return 0;
}
-static int ext3_create_journal(struct super_block * sb,
- struct ext3_super_block * es,
+static int ext3_create_journal(struct super_block *sb,
+ struct ext3_super_block *es,
unsigned int journal_inum)
{
journal_t *journal;
int err;
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
- "create journal.\n");
+ ext3_msg(sb, KERN_ERR,
+ "error: readonly filesystem when trying to "
+ "create journal");
return -EROFS;
}
@@ -2288,12 +2316,12 @@ static int ext3_create_journal(struct super_block * sb,
if (!journal)
return -EINVAL;
- printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
+ ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
journal_inum);
err = journal_create(journal);
if (err) {
- printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+ ext3_msg(sb, KERN_ERR, "error creating journal");
journal_destroy(journal);
return -EIO;
}
@@ -2321,7 +2349,18 @@ static int ext3_commit_super(struct super_block *sb,
if (!sbh)
return error;
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
@@ -2363,8 +2402,8 @@ out:
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext3_clear_journal_err(struct super_block * sb,
- struct ext3_super_block * es)
+static void ext3_clear_journal_err(struct super_block *sb,
+ struct ext3_super_block *es)
{
journal_t *journal;
int j_errno;
@@ -2555,10 +2594,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
__le32 ret;
if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
~EXT3_FEATURE_RO_COMPAT_SUPP))) {
- printk(KERN_WARNING "EXT3-fs: %s: couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ ext3_msg(sb, KERN_WARNING,
+ "warning: couldn't remount RDWR "
+ "because of unsupported optional "
+ "features (%x)", le32_to_cpu(ret));
err = -EROFS;
goto restore_opts;
}
@@ -2569,11 +2608,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
* require a full umount/remount for now.
*/
if (es->s_last_orphan) {
- printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+ ext3_msg(sb, KERN_WARNING, "warning: couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
- "umount/remount instead.\n",
- sb->s_id);
+ "umount/remount instead.");
err = -EINVAL;
goto restore_opts;
}
@@ -2673,13 +2711,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
- es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT3_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -2824,9 +2860,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
if (EXT3_SB(sb)->s_qf_names[type]) {
/* Quotafile not of fs root? */
if (path.dentry->d_parent != sb->s_root)
- printk(KERN_WARNING
- "EXT3-fs: Quota file not on filesystem root. "
- "Journaled quota will not work.\n");
+ ext3_msg(sb, KERN_WARNING,
+ "warning: Quota file not on filesystem root. "
+ "Journaled quota will not work.");
}
/*
@@ -2908,8 +2944,9 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
handle_t *handle = journal_current_handle();
if (!handle) {
- printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)"
- " cancelled because transaction is not started.\n",
+ ext3_msg(sb, KERN_WARNING,
+ "warning: quota write (off=%llu, len=%llu)"
+ " cancelled because transaction is not started.",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 545e37c4b91e..387d92d00b97 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -960,6 +960,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
+ error = ext3_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
@@ -985,9 +989,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext3_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext3_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
If unsure, say N.
-config EXT4DEV_COMPAT
- bool "Enable ext4dev compatibility"
- depends on EXT4_FS
- help
- Starting with 2.6.28, the name of the ext4 filesystem was
- renamed from ext4dev to ext4. Unfortunately there are some
- legacy userspace programs (such as klibc's fstype) have
- "ext4dev" hardcoded.
-
- To enable backwards compatibility so that systems that are
- still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
- please arrange to get your userspace programs fixed!
-
config EXT4_FS_XATTR
bool "Ext4 extended attributes"
depends on EXT4_FS
@@ -77,3 +63,12 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext4_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext4_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext4_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
extern int ext4_acl_chmod(struct inode *);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
static inline int
ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..22bc7435d913 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
+ grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
@@ -499,44 +499,6 @@ error_return:
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle: handle for this transaction
- * @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
- * @metadata: Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count,
- int metadata)
-{
- struct super_block *sb;
- unsigned long dquot_freed_blocks;
-
- /* this isn't the right place to decide whether block is metadata
- * inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- metadata = 1;
-
- /* We need to make sure we don't reuse
- * block released untill the transaction commit.
- * writeback mode have weak data consistency so
- * don't force data as metadata when freeing block
- * for writeback mode.
- */
- if (metadata == 0 && !ext4_should_writeback_data(inode))
- metadata = 1;
-
- sb = inode->i_sb;
-
- ext4_mb_free_blocks(handle, inode, block, count,
- metadata, &dquot_freed_blocks);
- if (dquot_freed_blocks)
- vfs_dq_free_block(inode, dquot_freed_blocks);
- return;
-}
-
-/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
* @nblocks: number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+ if (!ext4_bg_has_super(sb, group))
+ return 0;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..4df8621ec31c 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
- sbi->s_gdb_count + 1);
+ ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
struct rb_node *n = sbi->system_blks.rb_node;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
(start_blk + count > ext4_blocks_count(sbi->s_es)))
return 0;
while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..4cfc2f0edb3f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,29 +65,37 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
/* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE 1
+#define EXT4_MB_HINT_MERGE 0x0001
/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 2
+#define EXT4_MB_HINT_RESERVED 0x0002
/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 4
+#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
-#define EXT4_MB_HINT_FIRST 8
+#define EXT4_MB_HINT_FIRST 0x0008
/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 16
+#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
-#define EXT4_MB_HINT_DATA 32
+#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC 64
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
/* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC 128
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
/* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY 256
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
/* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL 512
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
/* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED 1024
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
struct ext4_allocation_request {
@@ -112,6 +120,31 @@ struct ext4_allocation_request {
};
/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+#define DIO_AIO_UNWRITTEN 0x1
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished AIO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* unwritten or not */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+} ext4_io_end_t;
+
+/*
* Special inodes numbers
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +284,6 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +321,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -330,7 +364,22 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
+ /* caller is from the direct IO path, request to creation of an
+ unitialized extents if not allocated, split the uninitialized
+ extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_DIO 0x0010
+#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA 0x0001
+#define EXT4_FREE_BLOCKS_FORGET 0x0002
/*
* ioctl commands
@@ -386,6 +435,9 @@ struct ext4_mount_options {
#endif
};
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -456,7 +508,6 @@ struct move_extent {
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};
-#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -481,8 +532,8 @@ struct move_extent {
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- time->tv_sec >> 32 : 0) |
- ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -490,7 +541,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
if (sizeof(time->tv_sec) > 4)
time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
<< 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -653,6 +704,11 @@ struct ext4_inode_info {
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
+
+ /* completed async DIOs that might need unwritten extents handling */
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
};
/*
@@ -700,6 +756,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -841,6 +898,7 @@ struct ext4_sb_info {
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -923,18 +981,11 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
- /* history to debug policy */
- struct ext4_mb_history *s_mb_history;
- int s_mb_history_cur;
- int s_mb_history_max;
- int s_mb_history_num;
- spinlock_t s_mb_history_lock;
- int s_mb_history_filter;
-
/* stats for buddy allocator */
spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -950,6 +1001,7 @@ struct ext4_sb_info {
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
@@ -960,6 +1012,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1276,8 +1331,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1336,18 +1389,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *);
extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
- ext4_fsblk_t, unsigned long, int, unsigned long *);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
- ext4_grpblk_t add);
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
/* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
@@ -1367,6 +1417,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1378,7 +1429,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1575,15 +1626,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- unsigned short bb_counters[];
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions. */
};
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1645,42 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
ext4_group_t group)
{
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spin_lock(ext4_group_lock_ptr(sb, group));
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
}
static inline void ext4_unlock_group(struct super_block *sb,
@@ -1650,6 +1731,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
*/
#define EXT_DEBUG__
#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
#define EXT_BREAK 1
#define EXT_REPEAT 2
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
#define EXT_MAX_BLOCK 0xffffffff
/*
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..b57e5c711b6d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
#include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
+
int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
return err;
}
-int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh)
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
+ */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ ext4_fsblk_t blocknr)
{
- int err = 0;
+ int err;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_forget(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh,
- handle, err);
+ might_sleep();
+
+ trace_ext4_forget(inode, is_metadata, blocknr);
+ BUFFER_TRACE(bh, "enter");
+
+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ "data mode %x\n",
+ bh, is_metadata, inode->i_mode,
+ test_opt(inode->i_sb, DATA_FLAGS));
+
+ /* In the no journal case, we can just do a bforget and return */
+ if (!ext4_handle_valid(handle)) {
+ bforget(bh);
+ return 0;
}
- else
- brelse(bh);
- return err;
-}
-int __ext4_journal_revoke(const char *where, handle_t *handle,
- ext4_fsblk_t blocknr, struct buffer_head *bh)
-{
- int err = 0;
+ /* Never use the revoke function if we are doing full data
+ * journaling: there is no need to, and a V1 superblock won't
+ * support it. Otherwise, only skip the revoke on un-journaled
+ * data blocks. */
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh,
- handle, err);
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (!is_metadata && !ext4_should_journal_data(inode))) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call jbd2_journal_forget");
+ err = jbd2_journal_forget(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ return err;
+ }
+ return 0;
+ }
+
+ /*
+ * data!=journal && (is_metadata || should_journal_data(inode))
+ */
+ BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+ err = jbd2_journal_revoke(handle, blocknr, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ ext4_abort(inode->i_sb, __func__,
+ "error %d when attempting revoke", err);
}
- else
- brelse(bh);
+ BUFFER_TRACE(bh, "exit");
return err;
}
@@ -89,7 +125,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- mark_buffer_dirty(bh);
+ if (inode && bh)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..84bc98ab9f0d 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -116,12 +116,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
- * Wrapper functions with which ext4 calls into JBD. The intent here is
- * to allow these to be turned into appropriate stubs so ext4 can control
- * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't
- * been done yet.
+ * Wrapper functions with which ext4 calls into JBD.
*/
-
void ext4_journal_abort_handle(const char *caller, const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
@@ -131,13 +127,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
int __ext4_journal_get_write_access(const char *where, handle_t *handle,
struct buffer_head *bh);
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh);
-
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_revoke(const char *where, handle_t *handle,
- ext4_fsblk_t blocknr, struct buffer_head *bh);
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh);
@@ -149,23 +141,24 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
__ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, (handle), (bh))
-#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+ __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+ (block_nr))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__func__, (handle), (bh))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
- if (handle == EXT4_NOJOURNAL_HANDLE)
+ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..2c4a9321fb14 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
err = ext4_journal_extend(handle, needed);
if (err <= 0)
return err;
- return ext4_journal_restart(handle, needed);
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ /*
+ * We have dropped i_data_sem so someone might have cached again
+ * an extent we are going to truncate.
+ */
+ ext4_ext_invalidate_cache(inode);
+
+ return err;
}
/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
return newblock;
}
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (size > 6)
+ size = 6;
#endif
+ }
return size;
}
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (size > 5)
+ size = 5;
#endif
+ }
return size;
}
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (size > 3)
+ size = 3;
#endif
+ }
return size;
}
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (size > 4)
+ size = 4;
#endif
+ }
return size;
}
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
int lcap, icap, rcap, leafs, idxs, num;
int newextents = blocks;
- rcap = ext4_ext_space_root_idx(inode);
- lcap = ext4_ext_space_block(inode);
- icap = ext4_ext_space_block_idx(inode);
+ rcap = ext4_ext_space_root_idx(inode, 0);
+ lcap = ext4_ext_space_block(inode, 0);
+ icap = ext4_ext_space_block_idx(inode, 0);
/* number of new leaf blocks needed */
num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
if (depth == ext_depth(inode)) {
if (depth == 0)
- max = ext4_ext_space_root(inode);
+ max = ext4_ext_space_root(inode, 1);
else
- max = ext4_ext_space_root_idx(inode);
+ max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
- max = ext4_ext_space_block(inode);
+ max = ext4_ext_space_block(inode, 1);
else
- max = ext4_ext_space_block_idx(inode);
+ max = ext4_ext_space_block_idx(inode, 1);
}
return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:%d:%llu ",
+ ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
+ ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:%d ",
+ ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
ext4_ext_invalidate_cache(inode);
return 0;
@@ -701,7 +723,7 @@ err:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
path[depth].p_ext++;
while (path[depth].p_ext <=
EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:%d in new leaf %llu\n",
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
+ ext4_ext_is_uninitialized(path[depth].p_ext),
ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
@@ -984,7 +1007,8 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
- ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+ ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+ EXT4_FREE_BLOCKS_METADATA);
}
}
kfree(ablocks);
@@ -1037,9 +1061,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1054,7 +1078,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
goto out;
curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+ curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
curp->p_hdr->eh_entries = cpu_to_le16(1);
curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
@@ -1563,7 +1587,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1579,10 +1603,13 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append %d block to %d:%d (from %llu)\n",
+ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -1651,9 +1678,10 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1691,11 @@ has_space:
len = EXT_MAX_EXTENT(eh) - nearex;
len = (len - 1) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1705,11 @@ has_space:
BUG_ON(newext->ee_block == nearex->ee_block);
len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
@@ -1694,7 +1724,8 @@ has_space:
merge:
/* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -1927,7 +1958,6 @@ errout:
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
- struct buffer_head *bh;
int err;
ext4_fsblk_t leaf;
@@ -1943,9 +1973,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
- bh = sb_find_get_block(inode->i_sb, leaf);
- ext4_forget(handle, 1, inode, bh, leaf);
- ext4_free_blocks(handle, inode, leaf, 1, 1);
+ ext4_free_blocks(handle, inode, 0, leaf, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return err;
}
@@ -2012,12 +2041,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
ext4_lblk_t from, ext4_lblk_t to)
{
- struct buffer_head *bh;
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- int i, metadata = 0;
+ int flags = EXT4_FREE_BLOCKS_FORGET;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- metadata = 1;
+ flags |= EXT4_FREE_BLOCKS_METADATA;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2042,11 +2070,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
num = le32_to_cpu(ex->ee_block) + ee_len - from;
start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %u blocks starting %llu\n", num, start);
- for (i = 0; i < num; i++) {
- bh = sb_find_get_block(inode->i_sb, start + i);
- ext4_forget(handle, 0, inode, bh, start + i);
- }
- ext4_free_blocks(handle, inode, start, num, metadata);
+ ext4_free_blocks(handle, inode, 0, start, num, flags);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2094,7 +2118,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
else
uninitialized = 0;
- ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ uninitialized, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
}
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
- err = ext4_ext_journal_restart(handle, credits);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
goto out;
@@ -2327,7 +2352,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (err == 0) {
ext_inode_hdr(inode)->eh_depth = 0;
ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(inode));
+ cpu_to_le16(ext4_ext_space_root(inode, 0));
err = ext4_ext_dirty(handle, inode, path);
}
}
@@ -2349,6 +2374,7 @@ void ext4_ext_init(struct super_block *sb)
*/
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
@@ -2360,6 +2386,7 @@ void ext4_ext_init(struct super_block *sb)
printk(", stats");
#endif
printk("\n");
+#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2461,7 +2488,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
#define EXT4_EXT_ZERO_LEN 7
-
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -2554,7 +2580,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path,
+ ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2610,7 +2637,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2728,7 +2755,192 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zero out the first half */
+ return allocated;
+ } else if (err)
+ goto fix_extent_len;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err ? err : allocated;
+
+fix_extent_len:
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be uninitialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of uninitialized extent to be written on success.
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t iblock,
+ unsigned int max_blocks,
+ int flags)
+{
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex1 = NULL;
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+ * If the uninitialized extent begins at the same logical
+ * block where the write begins, and the write completely
+ * covers the extent, then we don't need to split it.
+ */
+ if ((iblock == ee_block) && (allocated <= max_blocks))
+ return allocated;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * for sanity, update the length of the ex2 extent before
+ * we insert ex3, if ex1 is NULL. This is to avoid temporary
+ * overlap of blocks.
+ */
+ if (!ex1 && allocated > max_blocks)
+ ex2->ee_len = cpu_to_le16(max_blocks);
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zeroed the full extent */
+ /* blocks available from iblock */
+ return allocated;
+
+ } else if (err)
+ goto fix_extent_len;
+ /*
+ * The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ /*
+ * update the extent length after successful insert of the
+ * split extent
+ */
+ orig_ex.ee_len = cpu_to_le16(ee_len -
+ ext4_ext_get_actual_len(ex3));
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ allocated = max_blocks;
+ }
+ /*
+ * If there was a change of depth as part of the
+ * insertion of ex3 above, we need to update the length
+ * of the ex1 extent again here
+ */
+ if (ex1 && ex1 != ex) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+ * uninitialised still.
+ */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ ext4_ext_mark_uninitialized(ex2);
+ if (ex2 != ex)
+ goto insert;
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ ext_debug("out here\n");
+ goto out;
+insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2743,6 +2955,7 @@ insert:
} else if (err)
goto fix_extent_len;
out:
+ ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
@@ -2753,7 +2966,147 @@ fix_extent_len:
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ struct ext4_extent_header *eh;
+ int depth;
+ int err = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /*
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if (ex > EXT_FIRST_EXTENT(eh)) {
+ /*
+ * To merge left, pass "ex - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ ex--;
+ }
+ }
+ /*
+ * Try to Merge towards right.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ }
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, struct buffer_head *bh_result,
+ ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+ inode->i_ino, (unsigned long long)iblock, max_blocks,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+
+ /* DIO get_block() before submit the IO, split the extent */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+ /*
+ * Flag the inode(non aio case) or end_io struct (aio case)
+ * that this IO needs to convertion to written when IO is
+ * completed
+ */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ goto out;
+ }
+ /* async DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ goto map_out;
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ set_buffer_unwritten(bh_result);
+ goto out1;
+ }
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
+map_out:
+ set_buffer_mapped(bh_result);
+out1:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return err ? err : allocated;
+}
/*
* Block allocation/map/preallocation routine for extents based files
*
@@ -2784,9 +3137,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
__clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%u requested for inode %u\n",
+ ext_debug("blocks %u/%u requested for inode %lu\n",
iblock, max_blocks, inode->i_ino);
/* check in cache */
@@ -2849,7 +3203,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+ ext_debug("%u fit into %u:%d -> %llu\n", iblock,
ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
@@ -2859,33 +3213,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
- goto out;
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- if (allocated > max_blocks)
- allocated = max_blocks;
- /*
- * We have blocks reserved already. We
- * return allocated blocks so that delalloc
- * won't do block reservation for us. But
- * the buffer head will be unmapped so that
- * a read from the block returns 0s.
- */
- set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
- goto out2;
- }
-
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
- err = ret;
- goto out2;
- } else
- allocated = ret;
- goto outnew;
+ ret = ext4_ext_handle_uninitialized_extents(handle,
+ inode, iblock, max_blocks, path,
+ flags, allocated, bh_result, newblock);
+ return ret;
}
}
@@ -2950,29 +3281,46 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ /*
+ * io_end structure was created for every async
+ * direct IO write to the middle of the file.
+ * To avoid unecessary convertion for every aio dio rewrite
+ * to the mid of file, here we flag the IO that is really
+ * need the convertion.
+ * For non asycn direct IO case, flag the inode state
+ * that we need to perform convertion when IO is done.
+ */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |=
+ EXT4_STATE_DIO_UNWRITTEN;;
+ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), 0);
+ ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+ ext4_ext_get_actual_len(&newex), 0);
goto out2;
}
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
@@ -3171,6 +3519,64 @@ retry:
}
/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ ext4_lblk_t block;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ block = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - block;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, block, max_blocks);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+/*
* Callback function called for each extent to gather FIEMAP information.
*/
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..9630583cef28 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -81,56 +78,10 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext4_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext4_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
-static struct vm_operations_struct ext4_file_vm_ops = {
+static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ext4_page_mkwrite,
};
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fallocate = ext4_fallocate,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..a3c25076aef1 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,18 +44,23 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
+ *
+ * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0;
+ int err, ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync);
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ return ret;
/*
* data=writeback:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -74,10 +79,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ if (ext4_should_journal_data(inode))
+ return ext4_force_commit(inode->i_sb);
+
+ if (!journal)
+ ret = sync_mapping_buffers(inode->i_mapping);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
@@ -91,10 +97,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
};
- ret = sync_inode(inode, &wbc);
- if (journal && (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ err = sync_inode(inode, &wbc);
+ if (ret == 0)
+ ret = err;
}
out:
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, ext4_free_inodes_count(sb, gdp), x);
+ (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..d3f99e9e8a31 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -70,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
- int err;
-
- might_sleep();
-
- BUFFER_TRACE(bh, "enter");
-
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
-
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
-
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext4_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call jbd2_journal_forget");
- return ext4_journal_forget(handle, bh);
- }
- return 0;
- }
-
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext4_journal_revoke");
- err = ext4_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_abort(inode->i_sb, __func__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
-}
-
-/*
* Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
*/
@@ -192,11 +141,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ return ret;
}
/*
@@ -341,9 +304,7 @@ static int ext4_block_to_path(struct inode *inode,
int n = 0;
int final = 0;
- if (i_block < 0) {
- ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
+ if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
} else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +512,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
*
* Normally this function find the preferred place for block allocation,
* returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
+ ext4_fsblk_t goal;
+
/*
* XXX need to get goal block from mballoc's data structures
*/
- return ext4_find_near(inode, partial);
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
}
/**
@@ -640,6 +607,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
+ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
@@ -674,6 +643,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
+ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
if (*err && (target == blks)) {
/*
@@ -699,7 +669,7 @@ allocated:
return ret;
failed_out:
for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
return ret;
}
@@ -762,8 +732,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
+ /* Don't brelse(bh) here; it's done in
+ * ext4_journal_forget() below */
unlock_buffer(bh);
- brelse(bh);
goto failed;
}
@@ -794,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return err;
failed:
/* Allocation failed, free what we already allocated */
+ ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, branch[i].bh);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- for (i = 0; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ for (i = n+1; i < indirect_blks; i++)
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
- ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
return err;
}
@@ -880,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
err_out:
for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, where[i].bh);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(where[i-1].key), 1, 0);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+ blks, 0);
return err;
}
@@ -998,7 +979,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- else
+ if (err)
goto cleanup;
set_buffer_new(bh_result);
@@ -1109,22 +1090,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, sector_t logical,
- sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+ sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, "check_block_validity",
+ ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
(unsigned long long) phys, len);
- WARN_ON(1);
return -EIO;
}
return 0;
}
/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) {
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1;
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages)
+ break;
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -1155,6 +1193,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1170,8 +1211,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system corruption",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1235,8 +1276,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
}
@@ -1252,8 +1292,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system "
+ "corruption after allocation",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1776,11 +1817,11 @@ repeat:
if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ vfs_dq_release_reservation_block(inode, total);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
- vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1863,18 +1904,6 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back
@@ -2084,18 +2113,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_EMERG "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_EMERG "Free/Dirty block details\n");
- printk(KERN_EMERG "free_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_EMERG "dirty_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_EMERG "Block reservation details\n");
- printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_CRIT "Free/Dirty block details\n");
+ printk(KERN_CRIT "free_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_CRIT "dirty_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_CRIT "Block reservation details\n");
+ printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2181,14 +2210,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* writepage and writepages will again try to write
* the same.
*/
- printk(KERN_EMERG "%s block allocation failed for inode %lu "
- "at logical offset %llu with max blocks "
- "%zd with error %d\n",
- __func__, mpd->inode->i_ino,
- (unsigned long long)next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- printk(KERN_EMERG "This should not happen.!! "
- "Data will be lost\n");
+ ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu at "
+ "logical offset %llu with max blocks %zd with "
+ "error %d\n", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_CRIT "This should not happen!! "
+ "Data will be lost\n");
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
@@ -2329,7 +2358,7 @@ static int __mpage_da_writepage(struct page *page,
/*
* Rest of the page in the page_vec
* redirty then and skip then. We will
- * try to to write them again after
+ * try to write them again after
* starting a new transaction
*/
redirty_page_for_writepage(wbc, page);
@@ -2529,7 +2558,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
}
static int __ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc,
unsigned int len)
{
struct address_space *mapping = page->mapping;
@@ -2687,7 +2715,7 @@ static int ext4_writepage(struct page *page,
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- return __ext4_journalled_writepage(page, wbc, len);
+ return __ext4_journalled_writepage(page, len);
}
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2717,7 +2745,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2735,8 +2763,11 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0, nr_to_writebump = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
trace_ext4_da_writepages(inode, wbc);
@@ -2762,16 +2793,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- /*
- * Make sure nr_to_write is >= sbi->s_mb_stream_request
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
- wbc->nr_to_write = sbi->s_mb_stream_request;
- }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2786,6 +2807,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
mpd.wbc = wbc;
mpd.inode = mapping->host;
@@ -2813,10 +2864,9 @@ retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_CRIT "%s: jbd2_start: "
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- dump_stack();
goto out_writepages;
}
@@ -2850,6 +2900,7 @@ retry:
mpd.io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
+ trace_ext4_da_write_pages(inode, &mpd);
wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2887,9 +2938,10 @@ retry:
goto retry;
}
if (pages_skipped != wbc->pages_skipped)
- printk(KERN_EMERG "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
- __func__, wbc->nr_to_write, ret);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
@@ -2904,7 +2956,9 @@ retry:
out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
- wbc->nr_to_write -= nr_to_writebump;
+ if (wbc->nr_to_write > nr_to_writebump)
+ wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -3117,6 +3171,8 @@ out:
*/
int ext4_alloc_da_blocks(struct inode *inode)
{
+ trace_ext4_alloc_da_blocks(inode);
+
if (!EXT4_I(inode)->i_reserved_data_blocks &&
!EXT4_I(inode)->i_reserved_meta_blocks)
return 0;
@@ -3259,6 +3315,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3267,7 +3325,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3278,6 +3336,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -3300,9 +3359,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -3341,6 +3403,364 @@ out:
return ret;
}
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write to
+ * to the middle of file (<i_size)
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ iput(io->inode);
+ kfree(io);
+}
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != DIO_AIO_UNWRITTEN)
+ return ret;
+
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ int ret = 0;
+ int ret2 = 0;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ return ret;
+
+ dump_aio_dio_list(inode);
+ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_aio_dio_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be defered when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_get_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * desctroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && (EXT4_I(inode)->i_state &
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ }
+ return ret;
+ }
+
+ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3373,6 +3793,7 @@ static const struct address_space_operations ext4_ordered_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_writeback_aops = {
@@ -3388,6 +3809,7 @@ static const struct address_space_operations ext4_writeback_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_journalled_aops = {
@@ -3402,6 +3824,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_da_aops = {
@@ -3418,6 +3841,7 @@ static const struct address_space_operations ext4_da_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext4_set_aops(struct inode *inode)
@@ -3653,40 +4077,29 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext4_journal_get_write_access(handle, bh);
}
}
- /*
- * Any buffers which are on the journal will be in memory. We
- * find them on the hash table so jbd2_journal_revoke() will
- * run jbd2_journal_forget() on them. We've already detached
- * each block from the file, so bforget() in
- * jbd2_journal_forget() should be safe.
- *
- * AKPM: turn on bforget in jbd2_journal_forget()!!!
- */
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *tbh;
+ for (p = first; p < last; p++)
+ *p = 0;
- *p = 0;
- tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
- }
- }
-
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
+ ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
}
/**
@@ -3870,10 +4283,12 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
}
- ext4_free_blocks(handle, inode, nr, 1, 1);
+ ext4_free_blocks(handle, inode, 0, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA);
if (parent_bh) {
/*
@@ -3958,8 +4373,7 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (ei->i_disksize && inode->i_size == 0 &&
- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4313,7 +4727,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
long ret;
int block;
@@ -4325,11 +4738,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
+ iloc.bh = 0;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4352,7 +4765,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -4384,7 +4796,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -4416,10 +4827,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
ext4_error(sb, __func__,
"bad extended attribute block %llu in inode #%lu",
ei->i_file_acl, inode->i_ino);
@@ -4437,10 +4845,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
- if (ret) {
- brelse(bh);
+ if (ret)
goto bad_inode;
- }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -4468,7 +4874,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
- brelse(bh);
ret = -EIO;
ext4_error(inode->i_sb, __func__,
"bogus i_mode (%o) for inode=%lu",
@@ -4481,6 +4886,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -4581,8 +4987,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_inode_blocks_set(handle, raw_inode, ei))
goto out_brelse;
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- /* clear the migrate flag in the raw_inode */
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_HURD))
raw_inode->i_file_acl_high =
@@ -4684,19 +5089,40 @@ out_brelse:
*/
int ext4_write_inode(struct inode *inode, int wait)
{
+ int err;
+
if (current->flags & PF_MEMALLOC)
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
- if (!wait)
- return 0;
+ if (!wait)
+ return 0;
- return ext4_force_commit(inode->i_sb);
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
+
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+ if (wait)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
+ }
+ return err;
}
/*
@@ -5134,27 +5560,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
*/
void ext4_dirty_inode(struct inode *inode)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- if (!ext4_handle_valid(current_handle)) {
- ext4_mark_inode_dirty(current_handle, inode);
- return;
- }
-
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
@@ -5281,12 +5694,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
len = PAGE_CACHE_SIZE;
+ lock_page(page);
+ /*
+ * return if we have all the buffers mapped. This avoid
+ * the need to call write_begin/write_end which does a
+ * journal_start/journal_stop which can block and take
+ * long time
+ */
if (page_has_buffers(page)) {
- /* return if we have all the buffers mapped */
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped))
+ ext4_bh_unmapped)) {
+ unlock_page(page);
goto out_unlock;
+ }
}
+ unlock_page(page);
/*
* OK, we need to fill the hole... Do write_begin write_end
* to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..31e5ee0c858f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -239,14 +239,14 @@ setversion_out:
}
}
+ me.moved_len = 0;
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
fput(donor_filp);
- if (!err)
- if (copy_to_user((struct move_extent *)arg,
- &me, sizeof(me)))
- return -EFAULT;
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ return -EFAULT;
+
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..ab2dad1dfb7e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
*/
#include "mballoc.h"
+#include <linux/debugfs.h>
#include <trace/events/ext4.h>
/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
- void *buddy, unsigned first, int len,
+ void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned short min;
- unsigned short max;
- unsigned short chunk;
+ ext4_grpblk_t min;
+ ext4_grpblk_t max;
+ ext4_grpblk_t chunk;
unsigned short border;
BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
- unsigned short i = 0;
- unsigned short first;
- unsigned short len;
+ ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_grpblk_t first;
+ ext4_grpblk_t len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
char *data;
char *bitmap;
- mb_debug("init page %lu\n", page->index);
+ mb_debug(1, "init page %lu\n", page->index);
inode = page->mapping->host;
sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", first_group + i);
}
/* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug("put buddy for group %u in page %lu/%x\n",
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
- sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
/*
* incore got set to the group block bitmap below
*/
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug("put bitmap for group %u in page %lu/%x\n",
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
/* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
return err;
}
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret = 0;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug(1, "init group %u\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures that we don't reinit the buddy cache
+ * page which map to the group from which we are already
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have taken the alloc_sem lock.
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %u\n", group);
+ mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* groups mapped by the page is blocked
* till we are done with allocation
*/
+repeat_load_buddy:
down_read(e4b->alloc_semp);
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /* we need to check for group need init flag
+ * with alloc_semp held so that we can be sure
+ * that new blocks didn't get added to the group
+ * when we are loading the buddy cache
+ */
+ up_read(e4b->alloc_semp);
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ goto repeat_load_buddy;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
}
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
bsbits = ac->ac_sb->s_blocksize_bits;
- /* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
+
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2015,27 +2037,6 @@ repeat:
if (grp->bb_free == 0)
continue;
- /*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
- */
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_init_group(sb, group);
- if (err)
- goto out;
- }
-
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
- if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -2095,207 +2096,6 @@ out:
return err;
}
-#ifdef EXT4_MB_HISTORY
-struct ext4_mb_proc_session {
- struct ext4_mb_history *history;
- struct super_block *sb;
- int start;
- int max;
-};
-
-static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
- struct ext4_mb_history *hs,
- int first)
-{
- if (hs == s->history + s->max)
- hs = s->history;
- if (!first && hs == s->history + s->start)
- return NULL;
- while (hs->orig.fe_len == 0) {
- hs++;
- if (hs == s->history + s->max)
- hs = s->history;
- if (hs == s->history + s->start)
- return NULL;
- }
- return hs;
-}
-
-static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- if (!hs)
- return NULL;
- while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
- return hs;
-}
-
-static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
- loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- else
- return ext4_mb_history_skip_empty(s, ++hs, 0);
-}
-
-static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
-{
- char buf[25], buf2[25], buf3[25], *fmt;
- struct ext4_mb_history *hs = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-5s %-5s %-5s %-6s\n",
- "pid", "inode", "original", "goal", "result", "found",
- "grps", "cr", "flags", "merge", "tail", "broken");
- return 0;
- }
-
- if (hs->op == EXT4_MB_HISTORY_ALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "%-5u %-5s %-5u %-6u\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
- hs->goal.fe_start, hs->goal.fe_len,
- hs->goal.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
- hs->found, hs->groups, hs->cr, hs->flags,
- hs->merged ? "M" : "", hs->tail,
- hs->buddy ? 1 << hs->buddy : 0);
- } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
- } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s discard\n",
- hs->pid, hs->ino, buf2);
- } else if (hs->op == EXT4_MB_HISTORY_FREE) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s free\n",
- hs->pid, hs->ino, buf2);
- }
- return 0;
-}
-
-static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static struct seq_operations ext4_mb_seq_history_ops = {
- .start = ext4_mb_seq_history_start,
- .next = ext4_mb_seq_history_next,
- .stop = ext4_mb_seq_history_stop,
- .show = ext4_mb_seq_history_show,
-};
-
-static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
-{
- struct super_block *sb = PDE(inode)->data;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_mb_proc_session *s;
- int rc;
- int size;
-
- if (unlikely(sbi->s_mb_history == NULL))
- return -ENOMEM;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- s->sb = sb;
- size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
- s->history = kmalloc(size, GFP_KERNEL);
- if (s->history == NULL) {
- kfree(s);
- return -ENOMEM;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(s->history, sbi->s_mb_history, size);
- s->max = sbi->s_mb_history_max;
- s->start = sbi->s_mb_history_cur % s->max;
- spin_unlock(&sbi->s_mb_history_lock);
-
- rc = seq_open(file, &ext4_mb_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = s;
- } else {
- kfree(s->history);
- kfree(s);
- }
- return rc;
-
-}
-
-static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- kfree(s->history);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static ssize_t ext4_mb_seq_history_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- struct super_block *sb = s->sb;
- char str[32];
- int value;
-
- if (count >= sizeof(str)) {
- printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
- "mb_history", (int)sizeof(str));
- return -EOVERFLOW;
- }
-
- if (copy_from_user(str, buffer, count))
- return -EFAULT;
-
- value = simple_strtol(str, NULL, 0);
- if (value < 0)
- return -ERANGE;
- EXT4_SB(sb)->s_mb_history_filter = value;
-
- return count;
-}
-
-static struct file_operations ext4_mb_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = ext4_mb_seq_history_open,
- .read = seq_read,
- .write = ext4_mb_seq_history_write,
- .llseek = seq_lseek,
- .release = ext4_mb_seq_history_release,
-};
-
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
@@ -2328,7 +2128,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_buddy e4b;
struct sg {
struct ext4_group_info info;
- unsigned short counters[16];
+ ext4_grpblk_t counters[16];
} sg;
group--;
@@ -2366,7 +2166,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2187,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2395,82 +2195,6 @@ static struct file_operations ext4_mb_seq_groups_fops = {
.release = seq_release,
};
-static void ext4_mb_history_release(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_proc != NULL) {
- remove_proc_entry("mb_groups", sbi->s_proc);
- if (sbi->s_mb_history_max)
- remove_proc_entry("mb_history", sbi->s_proc);
- }
- kfree(sbi->s_mb_history);
-}
-
-static void ext4_mb_history_init(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
-
- if (sbi->s_proc != NULL) {
- if (sbi->s_mb_history_max)
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
- }
-
- sbi->s_mb_history_cur = 0;
- spin_lock_init(&sbi->s_mb_history_lock);
- i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
- /* if we can't allocate history, then we simple won't use it */
-}
-
-static noinline_for_stack void
-ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_mb_history h;
-
- if (sbi->s_mb_history == NULL)
- return;
-
- if (!(ac->ac_op & sbi->s_mb_history_filter))
- return;
-
- h.op = ac->ac_op;
- h.pid = current->pid;
- h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
- h.orig = ac->ac_o_ex;
- h.result = ac->ac_b_ex;
- h.flags = ac->ac_flags;
- h.found = ac->ac_found;
- h.groups = ac->ac_groups_scanned;
- h.cr = ac->ac_criteria;
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
- h.merged = 0;
- if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
- if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
- ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
- h.merged = 1;
- h.goal = ac->ac_g_ex;
- h.result = ac->ac_f_ex;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
- if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
- sbi->s_mb_history_cur = 0;
- spin_unlock(&sbi->s_mb_history_lock);
-}
-
-#else
-#define ext4_mb_history_release(sb)
-#define ext4_mb_history_init(sb)
-#endif
-
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2532,7 +2256,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;
+ meta_group_info[i]->bb_free_root.rb_node = NULL;
#ifdef DOUBLE_CHECK
{
@@ -2558,26 +2282,15 @@ exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
- grp->bb_free += add;
-}
-
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
- struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
@@ -2622,22 +2335,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freesgi;
}
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (ngroups -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
@@ -2655,7 +2352,6 @@ err_freebuddy:
while (i-- > 0)
kfree(ext4_get_group_info(sb, i));
i = num_meta_group_infos;
-err_freemeta:
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2672,14 +2368,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_offsets);
@@ -2717,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2735,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_history_init(sb);
+ if (sbi->s_proc)
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
- printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2758,7 +2453,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
- mb_debug("mballoc: %u PAs left\n", count);
+ mb_debug(1, "mballoc: %u PAs left\n", count);
}
@@ -2817,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
- ext4_mb_history_release(sb);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
@@ -2833,13 +2529,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
- ext4_fsblk_t discard_block;
struct list_head *l, *ltmp;
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
- mb_debug("gonna free %u blocks in group %u (0x%p):",
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2863,20 +2558,60 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
page_cache_release(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->group);
- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
- + entry->start_blk
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
- entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
-
+ if (test_opt(sb, DISCARD)) {
+ ext4_fsblk_t discard_block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ discard_block = (ext4_fsblk_t)entry->group *
+ EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(es->s_first_data_block);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+ }
kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
}
- mb_debug("freed %u blocks in %u structures\n", count, count2);
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+ debugfs_dir = debugfs_create_dir("ext4", NULL);
+ if (debugfs_dir)
+ debugfs_debug = debugfs_create_u8("mballoc-debug",
+ S_IRUGO | S_IWUSR,
+ debugfs_dir,
+ &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+ debugfs_remove(debugfs_debug);
+ debugfs_remove(debugfs_dir);
+}
+
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2904,6 +2639,7 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
+ ext4_create_debugfs_entry();
return 0;
}
@@ -2917,6 +2653,7 @@ void exit_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_ext_cachep);
+ ext4_remove_debugfs_entry();
}
@@ -3061,7 +2798,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
else
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug("#%u: goal %u blocks for locality group\n",
+ mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3180,23 +2917,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
ac->ac_o_ex.fe_logical < pa->pa_lstart));
- /* skip PA normalized request doesn't overlap with */
- if (pa->pa_lstart >= end) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- if (pa_end <= start) {
+ /* skip PAs this normalized request doesn't overlap with */
+ if (pa->pa_lstart >= end || pa_end <= start) {
spin_unlock(&pa->pa_lock);
continue;
}
BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+ /* adjust start or end to be adjacent to this pa */
if (pa_end <= ac->ac_o_ex.fe_logical) {
BUG_ON(pa_end < start);
start = pa_end;
- }
-
- if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
BUG_ON(pa->pa_lstart > end);
end = pa->pa_lstart;
}
@@ -3251,7 +2983,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+ mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3272,7 +3004,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_inc(&sbi->s_bal_breaks);
}
- ext4_mb_store_history(ac);
+ if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+ trace_ext4_mballoc_alloc(ac);
+ else
+ trace_ext4_mballoc_prealloc(ac);
}
/*
@@ -3300,7 +3035,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
}
/*
@@ -3324,7 +3059,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
}
/*
@@ -3382,6 +3117,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
continue;
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
/* found preallocated blocks, use them */
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3243,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
preallocated += len;
count++;
}
- mb_debug("prellocated %u for group %u\n", preallocated, group);
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3378,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
@@ -3698,7 +3438,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
@@ -3767,7 +3507,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (ac) {
ac->ac_sb = sb;
ac->ac_inode = pa->pa_inode;
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
}
while (bit < end) {
@@ -3777,7 +3516,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
- mb_debug(" free preallocated %u/%u in group %u\n",
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
(unsigned) start, (unsigned) next - bit,
(unsigned) group);
free += next - bit;
@@ -3787,7 +3526,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = next - bit;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3822,9 +3561,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- if (ac)
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-
trace_ext4_mb_release_group_pa(ac, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3839,7 +3575,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = pa->pa_len;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
return 0;
@@ -3868,7 +3604,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %u\n", group);
+ mb_debug(1, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
@@ -3992,7 +3728,7 @@ void ext4_discard_preallocations(struct inode *inode)
return;
}
- mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
@@ -4097,7 +3833,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
{
BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
}
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -4139,14 +3875,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
NULL, &start);
spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%lu:%d:%u \n", i,
- start, pa->pa_len);
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
}
ext4_unlock_group(sb, i);
if (grp->bb_free == 0)
continue;
- printk(KERN_ERR "%lu: %d/%d \n",
+ printk(KERN_ERR "%u: %d/%d \n",
i, grp->bb_free, grp->bb_fragments);
}
printk(KERN_ERR "\n");
@@ -4174,16 +3910,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- size = max(size, isize);
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4246,7 +3992,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4014,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
- mb_debug("discard locality group preallocation\n");
+ mb_debug(1, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
@@ -4681,18 +4427,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
return 0;
}
-/*
- * Main entry point into mballoc to free blocks
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @block: start physical block to free
+ * @count: number of blocks to count
+ * @metadata: Are these metadata blocks
*/
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count,
- int metadata, unsigned long *freed)
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_allocation_context *ac = NULL;
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
+ unsigned long freed = 0;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
@@ -4702,13 +4454,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
- *freed = 0;
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
+ if (!ext4_data_block_valid(sbi, block, count)) {
ext4_error(sb, __func__,
"Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
@@ -4716,11 +4471,35 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}
ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, metadata);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (flags & EXT4_FREE_BLOCKS_FORGET) {
+ struct buffer_head *tbh = bh;
+ int i;
+
+ BUG_ON(bh && (count > 1));
+
+ for (i = 0; i < count; i++) {
+ if (!bh)
+ tbh = sb_find_get_block(inode->i_sb,
+ block + i);
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, tbh, block + i);
+ }
+ }
+
+ /*
+ * We need to make sure we don't reuse the freed block until
+ * after the transaction is committed, which we can do by
+ * treating the block as metadata, below. We make an
+ * exception if the inode is to be written in writeback mode
+ * since writeback mode has weak data consistency guarantees.
+ */
+ if (!ext4_should_writeback_data(inode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
- ac->ac_op = EXT4_MB_HISTORY_FREE;
ac->ac_inode = inode;
ac->ac_sb = sb;
}
@@ -4787,13 +4566,14 @@ do_more:
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = count;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_free(ac);
}
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
goto error_return;
- if (metadata && ext4_handle_valid(handle)) {
+
+ if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
struct ext4_free_data *new_entry;
/*
* blocks being freed are metadata. these blocks shouldn't
@@ -4832,7 +4612,7 @@ do_more:
ext4_mb_release_desc(&e4b);
- *freed += count;
+ freed += count;
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4852,6 +4632,8 @@ do_more:
}
sb->s_dirt = 1;
error_return:
+ if (freed)
+ vfs_dq_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
if (ac)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,25 +37,23 @@
/*
*/
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...) printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= mb_enable_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
#else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
#endif
-/*
- * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
- * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
- */
-#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
-#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
-#define EXT4_MB_HISTORY_FREE 8 /* free */
-
-#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
- EXT4_MB_HISTORY_PREALLOC)
/*
* How long mballoc can look for a best extent (in found extents)
@@ -76,7 +74,7 @@
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
-#define MB_DEFAULT_STATS 1
+#define MB_DEFAULT_STATS 0
/*
* files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -128,8 +126,8 @@ struct ext4_prealloc_space {
unsigned pa_deleted;
ext4_fsblk_t pa_pstart; /* phys. block */
ext4_lblk_t pa_lstart; /* log. block */
- unsigned short pa_len; /* len of preallocated chunk */
- unsigned short pa_free; /* how many blocks are free */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +142,7 @@ struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start;
ext4_group_t fe_group;
- int fe_len;
+ ext4_grpblk_t fe_len;
};
/*
@@ -209,22 +207,6 @@ struct ext4_allocation_context {
#define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3
-struct ext4_mb_history {
- struct ext4_free_extent orig; /* orig allocation */
- struct ext4_free_extent goal; /* goal allocation */
- struct ext4_free_extent result; /* result allocation */
- unsigned pid;
- unsigned ino;
- __u16 found; /* how many extents have been found */
- __u16 groups; /* how many groups have been scanned */
- __u16 tail; /* what tail broke some buddy */
- __u16 buddy; /* buddy the tail ^^^ broke */
- __u16 flags;
- __u8 cr:3; /* which phase the result extent was found at */
- __u8 op:4;
- __u8 merged:1;
-};
-
struct ext4_buddy {
struct page *bd_buddy_page;
void *bd_buddy;
@@ -239,13 +221,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-#ifndef EXT4_MB_HISTORY
-static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- return;
-}
-#endif
-
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..d641e13e740e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(tmp_idata[i]), 1, 1);
+ ext4_free_blocks(handle, inode, 0,
+ le32_to_cpu(tmp_idata[i]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(i_data[0]), 1, 1);
+ ext4_free_blocks(handle, inode, 0,
+ le32_to_cpu(i_data[0]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
/* ei->i_data[EXT4_DIND_BLOCK] */
@@ -353,17 +361,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if EXT4_EXT_MIGRATE is cleared a block allocation
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -420,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, block, 1, 1);
+ ext4_free_blocks(handle, inode, 0, block, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return retval;
}
@@ -517,14 +525,15 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block allocation
- * via mmap write to holes. If we have allocated new blocks we fail
- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
- * The flag is updated with i_data_sem held to prevent racing with
- * block allocation.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
@@ -618,7 +627,7 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
-
+ unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..cad1e2edda7e 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
#include "ext4_extents.h"
#include "ext4.h"
-#define get_ext_path(path, inode, block, ret) \
- do { \
- path = ext4_ext_find_extent(inode, block, path); \
- if (IS_ERR(path)) { \
- ret = PTR_ERR(path); \
- path = NULL; \
- } \
- } while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **path)
+{
+ int ret = 0;
+
+ *path = ext4_ext_find_extent(inode, lblock, *path);
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+
+ return ret;
+}
/**
* copy_extent_status - Copy the extent's initialization status
@@ -60,12 +77,14 @@ static int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
+ struct ext4_extent_header *eh;
int ppos, leaf_ppos = path->p_depth;
ppos = leaf_ppos;
if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
/* leaf block */
*extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
return 0;
}
@@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
ext_block_hdr(path[cur_ppos+1].p_bh);
}
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
/* leaf block */
path[leaf_ppos].p_ext = *extent =
EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext_pblock(path[leaf_ppos].p_ext);
return 0;
}
}
@@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * mext_check_null_inode - NULL check for two inodes
*
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+ const char *function)
{
- struct inode *first = orig_inode, *second = donor_inode;
-
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
+ int ret = 0;
+
+ if (inode1 == NULL) {
+ ext4_error(inode2->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 NULL inode2 %lu", inode2->i_ino);
+ ret = -EIO;
+ } else if (inode2 == NULL) {
+ ext4_error(inode1->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 %lu inode2 NULL", inode1->i_ino);
+ ret = -EIO;
}
-
- down_read(&EXT4_I(first)->i_data_sem);
- down_read(&EXT4_I(second)->i_data_sem);
+ return ret;
}
/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
*/
static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
}
down_write(&EXT4_I(first)->i_data_sem);
- down_write(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_up_read - Release two inodes' read semaphore
- *
- * @orig_inode: original inode structure to be released its lock first
- * @donor_inode: donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
- up_read(&EXT4_I(orig_inode)->i_data_sem);
- up_read(&EXT4_I(donor_inode)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
- * mext_double_up_write - Release two inodes' write semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (new_flag) {
- get_ext_path(orig_path, orig_inode, eblock, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext))
+ orig_path, new_ext, 0))
goto out;
}
if (end_flag) {
- get_ext_path(orig_path, orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext))
+ orig_path, end_ext, 0))
goto out;
}
out:
@@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* oext |-----------|
* new_ext |-------|
*/
- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
/*
* Case: new_ext is smaller than original extent
@@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
o_end, &start_ext, &new_ext, &end_ext);
+out:
return ret;
}
@@ -553,9 +568,11 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* @tmp_oext: the extent that will belong to the donor inode
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
- * @max_count: the maximun length of extents
+ * @max_count: the maximum length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
*/
-static void
+static int
mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
ext4_lblk_t diff, orig_diff;
struct ext4_extent dext_old, oext_old;
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
dext_old = *tmp_dext;
oext_old = *tmp_oext;
@@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
copy_extent_status(&oext_old, tmp_dext);
copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
}
/**
@@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @donor_inode: donor inode
* @from: block offset of orig_inode
* @count: block count to be replaced
+ * @err: pointer to save return value
*
* Replace original inode extents and donor inode extents page by page.
* We implement this replacement in the following three steps:
@@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* 3. Change the block information of donor inode to point at the saved
* original inode blocks in the dummy extents.
*
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
*/
static int
mext_replace_branches(handle_t *handle, struct inode *orig_inode,
struct inode *donor_inode, ext4_lblk_t from,
- ext4_lblk_t count)
+ ext4_lblk_t count, int *err)
{
struct ext4_ext_path *orig_path = NULL;
struct ext4_ext_path *donor_path = NULL;
struct ext4_extent *oext, *dext;
struct ext4_extent tmp_dext, tmp_oext;
ext4_lblk_t orig_off = from, donor_off = from;
- int err = 0;
int depth;
int replaced_count = 0;
int dext_alen;
- mext_double_down_write(orig_inode, donor_inode);
+ /* Protect extent trees against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
/* Get the donor extent for the head */
- get_ext_path(donor_path, donor_inode, donor_off, err);
- if (donor_path == NULL)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
+ if (*err)
+ goto out;
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+ *err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ *err = -EIO;
+ goto out;
+ }
/* Set donor extent to orig extent */
- err = mext_leaf_block(handle, orig_inode,
+ *err = mext_leaf_block(handle, orig_inode,
orig_path, &tmp_dext, &orig_off);
- if (err < 0)
+ if (*err)
goto out;
/* Set orig extent to donor extent */
- err = mext_leaf_block(handle, donor_inode,
+ *err = mext_leaf_block(handle, donor_inode,
donor_path, &tmp_oext, &donor_off);
- if (err < 0)
+ if (*err)
goto out;
dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
- if (le32_to_cpu(oext->ee_block) +
- ext4_ext_get_actual_len(oext) <= orig_off) {
- err = 0;
- goto out;
- }
tmp_oext = *oext;
if (donor_path)
ext4_ext_drop_refs(donor_path);
- get_ext_path(donor_path, donor_inode,
- donor_off, err);
- if (donor_path == NULL)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
- if (le32_to_cpu(dext->ee_block) +
- ext4_ext_get_actual_len(dext) <= donor_off) {
- err = 0;
- goto out;
- }
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off,
- count - replaced_count);
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (*err)
+ goto out;
}
out:
@@ -720,8 +758,12 @@ out:
kfree(donor_path);
}
- mext_double_up_write(orig_inode, donor_inode);
- return err;
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ return replaced_count;
}
/**
@@ -733,16 +775,17 @@ out:
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @uninit: orig extent is uninitialized or not
+ * @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
* with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
- * on success, or a negative error value on failure.
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
*/
static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit)
+ int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
struct address_space *mapping = orig_inode->i_mapping;
@@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
- unsigned int tmp_data_len, data_len;
+ unsigned int tmp_data_size, data_size, replaced_size;
void *fsdata;
- int ret, i, jblocks;
+ int i, jblocks;
+ int err2 = 0;
+ int replaced_count = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
@@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
+ *err = PTR_ERR(handle);
+ return 0;
}
if (segment_eq(get_fs(), KERNEL_DS))
@@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
* Just swap data blocks between orig and donor.
*/
if (uninit) {
- ret = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page);
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
goto out2;
}
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
- /* Calculate data_len */
+ /* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
/* Replace the last block */
- tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
/*
- * If data_len equal zero, it shows data_len is multiples of
+ * If data_size equal zero, it shows data_size is multiples of
* blocksize. So we set appropriate value.
*/
- if (tmp_data_len == 0)
- tmp_data_len = blocksize;
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
- data_len = tmp_data_len +
+ data_size = tmp_data_size +
((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else {
- data_len = block_len_in_page << orig_inode->i_blkbits;
- }
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ replaced_size = data_size;
+
+ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
&page, &fsdata);
- if (unlikely(ret < 0))
+ if (unlikely(*err < 0))
goto out;
if (!PageUptodate(page)) {
@@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
/* Release old bh and drop refs */
try_to_release_page(page, 0);
- ret = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page);
- if (ret < 0)
- goto out;
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+ if (err2) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto out;
+ }
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
- ret = ext4_get_block(orig_inode,
+ *err = ext4_get_block(orig_inode,
(sector_t)(orig_blk_offset + i), bh, 0);
- if (ret < 0)
+ if (*err < 0)
goto out;
if (bh->b_this_page != NULL)
bh = bh->b_this_page;
}
- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
page, fsdata);
page = NULL;
@@ -871,11 +916,15 @@ out:
if (PageLocked(page))
unlock_page(page);
page_cache_release(page);
+ ext4_journal_stop(handle);
}
out2:
ext4_journal_stop(handle);
- return ret < 0 ? ret : 0;
+ if (err2)
+ *err = err2;
+
+ return replaced_count;
}
/**
@@ -886,7 +935,6 @@ out2:
* @orig_start: logical start offset in block for orig
* @donor_start: logical start offset in block for donor
* @len: the number of blocks to be moved
- * @moved_len: moved block length
*
* Check the arguments of ext4_move_extents() whether the files can be
* exchanged with each other.
@@ -894,9 +942,13 @@ out2:
*/
static int
mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len, __u64 moved_len)
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
@@ -921,14 +973,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -953,66 +997,63 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if (moved_len) {
- ext4_debug("ext4 move extent: moved_len should be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
- return -EINVAL;
- }
-
- if ((orig_start > MAX_DEFRAG_SIZE) ||
- (donor_start > MAX_DEFRAG_SIZE) ||
- (*len > MAX_DEFRAG_SIZE) ||
- (orig_start + *len > MAX_DEFRAG_SIZE)) {
- ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
- "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+ if ((orig_start > EXT_MAX_BLOCK) ||
+ (donor_start > EXT_MAX_BLOCK) ||
+ (*len > EXT_MAX_BLOCK) ||
+ (orig_start + *len > EXT_MAX_BLOCK)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_inode->i_size > donor_inode->i_size) {
- if (orig_start >= donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file size "
- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
- orig_start, donor_inode->i_size,
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > donor_inode->i_size) {
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file size [%lld]."
- "So adjust length from %llu to %lld "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
"[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_inode->i_size,
- *len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_inode->i_size - orig_start;
+ *len = donor_blocks - orig_start;
}
} else {
- if (orig_start >= orig_inode->i_size) {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file size "
- "[%lld] [inode:orig %lu, donor %lu]\n",
- orig_start, orig_inode->i_size,
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > orig_inode->i_size) {
+ if (orig_start + *len > orig_blocks) {
ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %lld. Because it should be "
- "less than original file size "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
"[ino:orig %lu, donor %lu]\n",
- *len, orig_inode->i_size - orig_start,
+ *len, orig_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_inode->i_size - orig_start;
+ *len = orig_blocks - orig_start;
}
}
if (!*len) {
- ext4_debug("ext4 move extent: len shoudld not be 0 "
+ ext4_debug("ext4 move extent: len should not be 0 "
"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
donor_inode->i_ino);
return -EINVAL;
@@ -1027,18 +1068,23 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
- if (inode1)
- mutex_lock(&inode1->i_mutex);
- else if (inode2)
- mutex_lock(&inode2->i_mutex);
- return;
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
+ if (inode1 == inode2) {
+ mutex_lock(&inode1->i_mutex);
+ goto out;
}
if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1094,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
+
+out:
+ return ret;
}
/**
@@ -1056,17 +1105,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
if (inode1)
mutex_unlock(&inode1->i_mutex);
if (inode2 && inode2 != inode1)
mutex_unlock(&inode2->i_mutex);
+
+out:
+ return ret;
}
/**
@@ -1123,70 +1183,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ int ret1, ret2, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
- /* protect orig and donor against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
- mext_double_down_read(orig_inode, donor_inode);
+ /* Protect extent tree against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
- donor_start, &len, *moved_len);
- mext_double_up_read(orig_inode, donor_inode);
- if (ret)
- goto out2;
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len);
+ if (ret1)
+ goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
block_end = block_start + len - 1;
if (file_end < block_end)
len -= block_end - file_end;
- get_ext_path(orig_path, orig_inode, block_start, ret);
- if (orig_path == NULL)
- goto out2;
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
/* Get path structure to check the hole */
- get_ext_path(holecheck_path, orig_inode, block_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
goto out;
depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL) {
- ret = -EINVAL;
- goto out;
- }
/*
- * Get proper extent whose ee_block is beyond block_start
- * if block_start was within the hole.
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
- }
- seq_start = block_start;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
/* No blocks within the specified range. */
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret = -EINVAL;
+ ret1 = -EINVAL;
goto out;
}
@@ -1206,7 +1280,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1246,29 +1320,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
seq_start = le32_to_cpu(ext_cur->ee_block);
rest_blocks = seq_blocks;
- /* Discard preallocations of two inodes */
- down_write(&EXT4_I(orig_inode)->i_data_sem);
- ext4_discard_preallocations(orig_inode);
- up_write(&EXT4_I(orig_inode)->i_data_sem);
-
- down_write(&EXT4_I(donor_inode)->i_data_sem);
- ext4_discard_preallocations(donor_inode);
- up_write(&EXT4_I(donor_inode)->i_data_sem);
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret = move_extent_par_page(o_filp, donor_inode,
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit);
- if (ret < 0)
- goto out;
- orig_page_offset++;
+ block_len_in_page, uninit,
+ &ret1);
+
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- BUG_ON(*moved_len > len);
+ if (ret1 < 0)
+ break;
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ break;
+ }
+ orig_page_offset++;
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
if (rest_blocks > blocks_per_page)
@@ -1277,20 +1361,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
+ double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret1 < 0)
+ break;
+
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- get_ext_path(holecheck_path, orig_inode,
- seq_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, seq_start, ret);
- if (orig_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1299,6 +1386,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
if (orig_path) {
ext4_ext_drop_refs(orig_path);
kfree(orig_path);
@@ -1307,14 +1399,13 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-out2:
- mext_inode_double_unlock(orig_inode, donor_inode);
-
- if (ret)
- return ret;
+ double_up_write_data_sem(orig_inode, donor_inode);
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
- /* All of the specified blocks must be exchanged in succeed */
- BUG_ON(*moved_len != len);
+ if (ret1)
+ return ret1;
+ else if (ret2)
+ return ret2;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..fde08c919d12 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
- bh, offset)) {
- brelse(bh);
+ bh, offset))
return -EIO;
- }
- if (ext4_match(namelen, name, de)) {
- brelse(bh);
+ if (ext4_match(namelen, name, de))
return -EEXIST;
- }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_journal_get_write_access(handle, bh);
if (err) {
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return err;
}
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return 0;
}
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (!(de))
return retval;
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if(!bh)
return retval;
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
+ if (retval != -ENOSPC) {
+ brelse(bh);
return retval;
+ }
if (blocks == 1 && !dx_fallback &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto journal_error;
err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
+ if (err != -ENOSPC)
goto cleanup;
- }
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
goto cleanup;
journal_error:
@@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- if (!ext4_handle_valid(handle))
+ /* ext4_handle_valid() assumes a valid handle_t pointer */
+ if (handle && !ext4_handle_valid(handle))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
/*
@@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ EXT4_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
@@ -2536,7 +2531,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fiemap = ext4_fiemap,
};
@@ -2548,5 +2543,5 @@ const struct inode_operations ext4_special_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
struct inode *inode = NULL;
handle_t *handle;
int gdb_off, gdb_num;
- int num_grp_locked = 0;
int err, err2;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* using the new disk blocks.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* descriptor
*/
err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+ if (err)
goto exit_journal;
- }
/*
* Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
ext4_handle_dirty_metadata(handle, NULL, primary);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..5a2db612950b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,17 +45,11 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "mballoc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static int default_mb_history_length = 1000;
-
-module_param_named(default_mb_history_length, default_mb_history_length,
- int, 0644);
-MODULE_PARM_DESC(default_mb_history_length,
- "Default number of entries saved for mb_history");
-
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
@@ -188,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -214,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
}
return jbd2_journal_start(journal, nblocks);
}
- /*
- * We're not journaling, return the appropriate indication.
- */
- current->journal_info = EXT4_NOJOURNAL_HANDLE;
- return current->journal_info;
+ return ext4_get_nojournal();
}
/*
@@ -234,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int rc;
if (!ext4_handle_valid(handle)) {
- /*
- * Do this here since we don't call jbd2_journal_stop() in
- * no-journal mode.
- */
- current->journal_info = NULL;
+ ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
@@ -344,7 +360,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -578,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
@@ -682,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;
return &ei->vfs_inode;
}
@@ -877,6 +899,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NO_AUTO_DA_ALLOC))
seq_puts(seq, ",noauto_da_alloc");
+ if (test_opt(sb, DISCARD))
+ seq_puts(seq, ",discard");
+
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -962,7 +990,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext4_quota_operations = {
+static const struct dquot_operations ext4_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -983,7 +1011,7 @@ static struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1050,14 +1078,15 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
- Opt_inode_readahead_blks, Opt_journal_ioprio
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
};
static const match_table_t tokens = {
@@ -1082,6 +1111,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
@@ -1097,7 +1127,6 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
- {Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1123,6 +1152,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
};
@@ -1340,13 +1371,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
- case Opt_mb_history_length:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_mb_history_max = option;
- break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1551,6 +1575,12 @@ set_qf_format:
else
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
+ case Opt_discard:
+ set_opt(sbi->s_mount_opt, DISCARD);
+ break;
+ case Opt_nodiscard:
+ clear_opt(sbi->s_mount_opt, DISCARD);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
@@ -1646,13 +1676,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- if (EXT4_SB(sb)->s_journal) {
- ext4_msg(sb, KERN_INFO, "%s journal on %s",
- EXT4_SB(sb)->s_journal->j_inode ? "internal" :
- "external", EXT4_SB(sb)->s_journal->j_devname);
- } else {
- ext4_msg(sb, KERN_INFO, "no journal");
- }
return res;
}
@@ -1666,14 +1689,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
size_t size;
int i;
- if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ if (groups_per_flex < 2) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -1695,12 +1718,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
- ext4_free_inodes_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
- ext4_free_blks_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
- ext4_used_dirs_count(sb, gdp));
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_add(ext4_free_blks_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
return 1;
@@ -2197,6 +2220,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2234,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
NULL,
};
@@ -2253,6 +2278,49 @@ static struct kobj_type ext4_ktype = {
.release = ext4_sb_release,
};
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ return 1;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2274,7 +2342,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int db_count;
unsigned int i;
int needs_recovery, has_huge_files;
- int features;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2371,7 +2438,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
@@ -2401,39 +2467,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
- ~EXT4_FEATURE_INCOMPAT_SUPP));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount RDWR because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
- goto failed_mount;
- }
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (has_huge_files) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LBDAF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LBDAF");
- goto failed_mount;
- }
- }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2505,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2587,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ if ((ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+ (ext4_blocks_count(es) >
+ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
ext4_msg(sb, KERN_ERR, "filesystem"
- " too large to mount safely");
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = -EFBIG;
goto failed_mount;
}
@@ -2595,6 +2640,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2656,6 +2703,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
/*
* set up enough so that it can read an inode
@@ -2689,26 +2737,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3;
- if (!(sb->s_flags & MS_RDONLY) &&
- EXT4_SB(sb)->s_journal->j_failed_commit) {
- ext4_msg(sb, KERN_CRIT, "error: "
- "ext4_fill_super: Journal transaction "
- "%u is corrupt",
- EXT4_SB(sb)->s_journal->j_failed_commit);
- if (test_opt(sb, ERRORS_RO)) {
- ext4_msg(sb, KERN_CRIT,
- "Mounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- }
- if (test_opt(sb, ERRORS_PANIC)) {
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, 1);
- goto failed_mount4;
- }
- }
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -2781,6 +2809,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2832,12 +2866,12 @@ no_journal:
"available");
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt(sb, DELALLOC) &&
+ (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+ }
err = ext4_setup_system_zone(sb);
if (err) {
@@ -2893,6 +2927,8 @@ cantfind_ext4:
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -3147,9 +3183,7 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
- if (journal->j_flags & JBD2_BARRIER)
- ext4_msg(sb, KERN_INFO, "barriers enabled");
- else
+ if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3208,7 +3242,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3333,11 +3378,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ flush_workqueue(sbi->dio_unwritten_wq);
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
@@ -3477,18 +3524,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
} else {
- int ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- ext4_msg(sb, KERN_WARNING, "couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x)",
- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
-
/*
* Make sure the group descriptor checksums
* are sane. If they aren't, refuse to remount r/w.
@@ -3624,13 +3664,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
- ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3930,27 +3968,6 @@ static struct file_system_type ext4_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-#ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data,struct vfsmount *mnt)
-{
- printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
- "to mount using ext4\n", dev_name);
- printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
- "will go away by 2.6.31\n", dev_name);
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
-}
-
-static struct file_system_type ext4dev_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext4dev",
- .get_sb = ext4dev_get_sb,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS("ext4dev");
-#endif
-
static int __init init_ext4_fs(void)
{
int err;
@@ -3975,13 +3992,6 @@ static int __init init_ext4_fs(void)
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
-#ifdef CONFIG_EXT4DEV_COMPAT
- err = register_filesystem(&ext4dev_fs_type);
- if (err) {
- unregister_filesystem(&ext4_fs_type);
- goto out;
- }
-#endif
return 0;
out:
destroy_inodecache();
@@ -4000,9 +4010,6 @@ out4:
static void __exit exit_ext4_fs(void)
{
unregister_filesystem(&ext4_fs_type);
-#ifdef CONFIG_EXT4DEV_COMPAT
- unregister_filesystem(&ext4dev_fs_type);
-#endif
destroy_inodecache();
exit_ext4_xattr();
exit_ext4_mballoc();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..910bf9a59cb3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ea_bdebug(bh, "refcount now=0; freeing");
if (ce)
mb_cache_entry_free(ce);
- ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
get_bh(bh);
- ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ ext4_free_blocks(handle, inode, bh, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -810,18 +811,30 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
if (error)
goto cleanup;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
- ext4_free_blocks(handle, inode, block, 1, 1);
+ ext4_free_blocks(handle, inode, 0, block, 1,
+ EXT4_FREE_BLOCKS_METADATA);
error = -EIO;
goto cleanup;
}
@@ -977,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1002,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..e6efdfa0f6db 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -44,7 +44,8 @@ struct fat_mount_options {
nocase:1, /* Does this need case conversion? 0=need case conversion*/
usefree:1, /* Use free_clusters for FAT32 */
tz_utc:1, /* Filesystem timestamps are in UTC */
- rodir:1; /* allow ATTR_RO for directory */
+ rodir:1, /* allow ATTR_RO for directory */
+ discard:1; /* Issue discard requests on deletions */
};
#define FAT_HASH_BITS 8
@@ -323,7 +324,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
/* fat/misc.c */
extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3))) __cold;
-extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
__le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a81037721a6f..81184d3b75a3 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster)
goto error;
}
- /*
- * Issue discard for the sectors we no longer care about,
- * batching contiguous clusters into one request
- */
- if (cluster != fatent.entry + 1) {
- int nr_clus = fatent.entry - first_cl + 1;
-
- sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
- nr_clus * sbi->sec_per_clus);
- first_cl = cluster;
+ if (sbi->options.discard) {
+ /*
+ * Issue discard for the sectors we no longer
+ * care about, batching contiguous clusters
+ * into one request
+ */
+ if (cluster != fatent.entry + 1) {
+ int nr_clus = fatent.entry - first_cl + 1;
+
+ sb_issue_discard(sb,
+ fat_clus_to_blknr(sbi, first_cl),
+ nr_clus * sbi->sec_per_clus);
+
+ first_cl = cluster;
+ }
}
ops->ent_put(&fatent, FAT_ENT_FREE);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- if (IS_SYNC(inode))
- err = sync_page_range_nolock(inode, mapping, start, count);
+ if (IS_SYNC(inode)) {
+ int err2;
+
+ /*
+ * Opencode syncing since we don't have a file open to use
+ * standard fsync path.
+ */
+ err = filemap_fdatawrite_range(mapping, start,
+ start + count - 1);
+ err2 = sync_mapping_buffers(mapping);
+ if (!err)
+ err = err2;
+ err2 = write_inode_now(inode, 1);
+ if (!err)
+ err = err2;
+ if (!err) {
+ err = filemap_fdatawait_range(mapping, start,
+ start + count - 1);
+ }
+ }
out:
return err;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c49bb0..14da530b05ca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
static int fat_sync_fs(struct super_block *sb, int wait)
{
- lock_super(sb);
- fat_clusters_flush(sb);
- sb->s_dirt = 0;
- unlock_super(sb);
+ int err = 0;
- return 0;
+ if (sb->s_dirt) {
+ lock_super(sb);
+ sb->s_dirt = 0;
+ err = fat_clusters_flush(sb);
+ unlock_super(sb);
+ }
+
+ return err;
}
static void fat_put_super(struct super_block *sb)
@@ -470,19 +474,11 @@ static void fat_put_super(struct super_block *sb)
iput(sbi->fat_inode);
- if (sbi->nls_disk) {
- unload_nls(sbi->nls_disk);
- sbi->nls_disk = NULL;
- sbi->options.codepage = fat_default_codepage;
- }
- if (sbi->nls_io) {
- unload_nls(sbi->nls_io);
- sbi->nls_io = NULL;
- }
- if (sbi->options.iocharset != fat_default_iocharset) {
+ unload_nls(sbi->nls_disk);
+ unload_nls(sbi->nls_io);
+
+ if (sbi->options.iocharset != fat_default_iocharset)
kfree(sbi->options.iocharset);
- sbi->options.iocharset = fat_default_iocharset;
- }
sb->s_fs_info = NULL;
kfree(sbi);
@@ -820,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",shortname=mixed");
break;
case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
- /* seq_puts(m, ",shortname=lower"); */
+ seq_puts(m, ",shortname=lower");
break;
default:
seq_puts(m, ",shortname=unknown");
@@ -862,6 +858,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",errors=panic");
else
seq_puts(m, ",errors=remount-ro");
+ if (opts->discard)
+ seq_puts(m, ",discard");
return 0;
}
@@ -875,7 +873,7 @@ enum {
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
- Opt_err_panic, Opt_err_ro, Opt_err,
+ Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
};
static const match_table_t fat_tokens = {
@@ -903,6 +901,7 @@ static const match_table_t fat_tokens = {
{Opt_err_cont, "errors=continue"},
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
+ {Opt_discard, "discard"},
{Opt_obsolate, "conv=binary"},
{Opt_obsolate, "conv=text"},
{Opt_obsolate, "conv=auto"},
@@ -971,7 +970,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
opts->codepage = fat_default_codepage;
opts->iocharset = fat_default_iocharset;
if (is_vfat) {
- opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
+ opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
opts->rodir = 0;
} else {
opts->shortname = 0;
@@ -1140,6 +1139,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
case Opt_rodir:
opts->rodir = 1;
break;
+ case Opt_discard:
+ opts->discard = 1;
+ break;
/* obsolete mount options */
case Opt_obsolate:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
/* Flushes the number of free clusters on FAT32 */
/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
-void fat_clusters_flush(struct super_block *sb)
+int fat_clusters_flush(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct fat_boot_fsinfo *fsinfo;
if (sbi->fat_bits != 32)
- return;
+ return 0;
bh = sb_bread(sb, sbi->fsinfo_sector);
if (bh == NULL) {
printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
- return;
+ return -EIO;
}
fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
mark_buffer_dirty(bh);
}
brelse(bh);
+
+ return 0;
}
/*
@@ -119,8 +121,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
MSDOS_I(inode)->i_start = new_dclus;
MSDOS_I(inode)->i_logstart = new_dclus;
/*
- * Since generic_osync_inode() synchronize later if
- * this is not directory, we don't here.
+ * Since generic_write_sync() synchronizes regular files later,
+ * we sync here only directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
ret = fat_sync_inode(inode);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
int charlen;
if (utf8) {
- int name_len = strlen(name);
-
- *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
-
- /*
- * We stripped '.'s before and set len appropriately,
- * but utf8s_to_utf16s doesn't care about len
- */
- *outlen -= (name_len - len);
-
- if (*outlen > 255)
+ *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
+ if (*outlen < 0)
+ return *outlen;
+ else if (*outlen > 255)
return -ENAMETOOLONG;
op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..2cf93ec40a67 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
return pid;
}
+static int f_setown_ex(struct file *filp, unsigned long arg)
+{
+ struct f_owner_ex * __user owner_p = (void * __user)arg;
+ struct f_owner_ex owner;
+ struct pid *pid;
+ int type;
+ int ret;
+
+ ret = copy_from_user(&owner, owner_p, sizeof(owner));
+ if (ret)
+ return ret;
+
+ switch (owner.type) {
+ case F_OWNER_TID:
+ type = PIDTYPE_MAX;
+ break;
+
+ case F_OWNER_PID:
+ type = PIDTYPE_PID;
+ break;
+
+ case F_OWNER_PGRP:
+ type = PIDTYPE_PGID;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ pid = find_vpid(owner.pid);
+ if (owner.pid && !pid)
+ ret = -ESRCH;
+ else
+ ret = __f_setown(filp, pid, type, 1);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int f_getown_ex(struct file *filp, unsigned long arg)
+{
+ struct f_owner_ex * __user owner_p = (void * __user)arg;
+ struct f_owner_ex owner;
+ int ret = 0;
+
+ read_lock(&filp->f_owner.lock);
+ owner.pid = pid_vnr(filp->f_owner.pid);
+ switch (filp->f_owner.pid_type) {
+ case PIDTYPE_MAX:
+ owner.type = F_OWNER_TID;
+ break;
+
+ case PIDTYPE_PID:
+ owner.type = F_OWNER_PID;
+ break;
+
+ case PIDTYPE_PGID:
+ owner.type = F_OWNER_PGRP;
+ break;
+
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ break;
+ }
+ read_unlock(&filp->f_owner.lock);
+
+ if (!ret)
+ ret = copy_to_user(owner_p, &owner, sizeof(owner));
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SETOWN:
err = f_setown(filp, arg, 1);
break;
+ case F_GETOWN_EX:
+ err = f_getown_ex(filp, arg);
+ break;
+ case F_SETOWN_EX:
+ err = f_setown_ex(filp, arg);
+ break;
case F_GETSIG:
err = filp->f_owner.signum;
break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
static void send_sigio_to_task(struct task_struct *p,
struct fown_struct *fown,
- int fd,
- int reason)
+ int fd, int reason, int group)
{
/*
* F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
else
si.si_band = band_table[reason - POLL_IN];
si.si_fd = fd;
- if (!group_send_sig_info(signum, &si, p))
+ if (!do_send_sig_info(signum, &si, p, group))
break;
/* fall-through: fall back on the old plain SIGIO signal */
case 0:
- group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
+ do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
}
}
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
struct task_struct *p;
enum pid_type type;
struct pid *pid;
+ int group = 1;
read_lock(&fown->lock);
+
type = fown->pid_type;
+ if (type == PIDTYPE_MAX) {
+ group = 0;
+ type = PIDTYPE_PID;
+ }
+
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
- send_sigio_to_task(p, fown, fd, band);
+ send_sigio_to_task(p, fown, fd, band, group);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
}
static void send_sigurg_to_task(struct task_struct *p,
- struct fown_struct *fown)
+ struct fown_struct *fown, int group)
{
if (sigio_perm(p, fown, SIGURG))
- group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
+ do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
}
int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
struct task_struct *p;
enum pid_type type;
struct pid *pid;
+ int group = 1;
int ret = 0;
read_lock(&fown->lock);
+
type = fown->pid_type;
+ if (type == PIDTYPE_MAX) {
+ group = 0;
+ type = PIDTYPE_PID;
+ }
+
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
- send_sigurg_to_task(p, fown);
+ send_sigurg_to_task(p, fown, group);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
diff --git a/fs/file.c b/fs/file.c
index f313314f996f..87e129030ab1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/time.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
* Handle nr_files sysctl
*/
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
- return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ return proc_dointvec(table, write, buffer, lenp, ppos);
}
#else
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,257 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue. Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own. Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
*/
-static int writeback_acquire(struct backing_dev_info *bdi)
+int nr_pdflush_threads;
+
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+ long nr_pages;
+ struct super_block *sb;
+ enum writeback_sync_modes sync_mode;
+ int for_kupdate:1;
+ int range_cyclic:1;
+ int for_background:1;
+};
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+ struct list_head list; /* pending work list */
+ struct rcu_head rcu_head; /* for RCU free/clear of work */
+
+ unsigned long seen; /* threads that have seen this work */
+ atomic_t pending; /* number of threads still to do work */
+
+ struct wb_writeback_args args; /* writeback arguments */
+
+ unsigned long state; /* flag bits, see WS_* */
+};
+
+enum {
+ WS_USED_B = 0,
+ WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
{
- return !test_and_set_bit(BDI_pdflush, &bdi->state);
+ return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+ struct wb_writeback_args *args)
+{
+ INIT_RCU_HEAD(&work->rcu_head);
+ work->args = *args;
+ work->state = WS_USED;
}
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
*
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
*/
int writeback_in_progress(struct backing_dev_info *bdi)
{
- return test_bit(BDI_pdflush, &bdi->state);
+ return !list_empty(&bdi->work_list);
}
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
{
- BUG_ON(!writeback_in_progress(bdi));
- clear_bit(BDI_pdflush, &bdi->state);
+ clear_bit(WS_USED_B, &work->state);
+ smp_mb__after_clear_bit();
+ /*
+ * work can have disappeared at this point. bit waitq functions
+ * should be able to tolerate this, provided bdi_sched_wait does
+ * not dereference it's pointer argument.
+ */
+ wake_up_bit(&work->state, WS_USED_B);
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
+ struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
+ if (!bdi_work_on_stack(work))
+ kfree(work);
+ else
+ bdi_work_clear(work);
}
-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
{
- struct super_block *sb = inode->i_sb;
+ const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+ int onstack = bdi_work_on_stack(work);
/*
- * Don't do this for I_DIRTY_PAGES - that doesn't actually
- * dirty the inode itself
+ * For allocated work, we can clear the done/seen bit right here.
+ * For on-stack work, we need to postpone both the clear and free
+ * to after the RCU grace period, since the stack could be invalidated
+ * as soon as bdi_work_clear() has done the wakeup.
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
- }
+ if (!onstack)
+ bdi_work_clear(work);
+ if (sync_mode == WB_SYNC_NONE || onstack)
+ call_rcu(&work->rcu_head, bdi_work_free);
+}
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
/*
- * make sure that changes are seen by all cpus before we test i_state
- * -- mikulas
+ * The caller has retrieved the work arguments from this work,
+ * drop our reference. If this is the last ref, delete and free it
*/
- smp_mb();
+ if (atomic_dec_and_test(&work->pending)) {
+ struct backing_dev_info *bdi = wb->bdi;
- /* avoid the locking if we can */
- if ((inode->i_state & flags) == flags)
- return;
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&work->list);
+ spin_unlock(&bdi->wb_lock);
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
+ wb_work_complete(work);
+ }
+}
- spin_lock(&inode_lock);
- if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+ work->seen = bdi->wb_mask;
+ BUG_ON(!work->seen);
+ atomic_set(&work->pending, bdi->wb_cnt);
+ BUG_ON(!bdi->wb_cnt);
- inode->i_state |= flags;
+ /*
+ * list_add_tail_rcu() contains the necessary barriers to
+ * make sure the above stores are seen before the item is
+ * noticed on the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&work->list, &bdi->work_list);
+ spin_unlock(&bdi->wb_lock);
- /*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
- */
- if (inode->i_state & I_SYNC)
- goto out;
+ /*
+ * If the default thread isn't there, make sure we add it. When
+ * it gets created and wakes up, we'll run this work.
+ */
+ if (unlikely(list_empty_careful(&bdi->wb_list)))
+ wake_up_process(default_backing_dev_info.wb.task);
+ else {
+ struct bdi_writeback *wb = &bdi->wb;
- /*
- * Only add valid (hashed) inodes to the superblock's
- * dirty list. Add blockdev inodes as well.
- */
- if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
- goto out;
- }
- if (inode->i_state & (I_FREEING|I_CLEAR))
- goto out;
+ if (wb->task)
+ wake_up_process(wb->task);
+ }
+}
- /*
- * If the inode was already on s_dirty/s_io/s_more_io, don't
- * reposition it (that would break s_dirty time-ordering).
- */
- if (!was_dirty) {
- inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
- }
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+ wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_args *args)
+{
+ struct bdi_work *work;
+
+ /*
+ * This is WB_SYNC_NONE writeback, so if allocation fails just
+ * wakeup the thread for old dirty data writeback
+ */
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ bdi_work_init(work, args);
+ bdi_queue_work(bdi, work);
+ } else {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ if (wb->task)
+ wake_up_process(wb->task);
}
-out:
- spin_unlock(&inode_lock);
}
-EXPORT_SYMBOL(__mark_inode_dirty);
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ * This does WB_SYNC_ALL data integrity writeback and waits for the
+ * IO to complete. Callers must hold the sb s_umount semaphore for
+ * reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+ struct super_block *sb)
+{
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ };
+ struct bdi_work work;
-static int write_inode(struct inode *inode, int sync)
+ bdi_work_init(&work, &args);
+ work.state |= WS_ONSTACK;
+
+ bdi_queue_work(bdi, &work);
+ bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * started when this function returns, we make no guarentees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+ long nr_pages)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, sync);
- return 0;
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_pages = nr_pages,
+ .range_cyclic = 1,
+ };
+
+ /*
+ * We treat @nr_pages=0 as the special case to do background writeback,
+ * ie. to sync pages until the background dirty threshold is reached.
+ */
+ if (!nr_pages) {
+ args.nr_pages = LONG_MAX;
+ args.for_background = 1;
+ }
+
+ bdi_alloc_queue_work(bdi, &args);
}
/*
@@ -191,31 +277,32 @@ static int write_inode(struct inode *inode, int sync)
* furthest end of its superblock's dirty-inode list.
*
* Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- if (!list_empty(&sb->s_dirty)) {
- struct inode *tail_inode;
+ if (!list_empty(&wb->b_dirty)) {
+ struct inode *tail;
- tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
- if (time_before(inode->dirtied_when,
- tail_inode->dirtied_when))
+ tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &wb->b_dirty);
}
/*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode)
{
- list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+ list_move(&inode->i_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -235,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
* For inodes being constantly redirtied, dirtied_when can get stuck.
* It _appears_ to be in the future, but is actually in distant past.
* This test is necessary to prevent such wrapped-around relative times
- * from permanently stopping the whole pdflush writeback.
+ * from permanently stopping the whole bdi writeback.
*/
ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
@@ -249,33 +336,56 @@ static void move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
unsigned long *older_than_this)
{
+ LIST_HEAD(tmp);
+ struct list_head *pos, *node;
+ struct super_block *sb = NULL;
+ struct inode *inode;
+ int do_sb_sort = 0;
+
while (!list_empty(delaying_queue)) {
- struct inode *inode = list_entry(delaying_queue->prev,
- struct inode, i_list);
+ inode = list_entry(delaying_queue->prev, struct inode, i_list);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
- list_move(&inode->i_list, dispatch_queue);
+ if (sb && sb != inode->i_sb)
+ do_sb_sort = 1;
+ sb = inode->i_sb;
+ list_move(&inode->i_list, &tmp);
+ }
+
+ /* just one sb in list, splice to dispatch_queue and we're done */
+ if (!do_sb_sort) {
+ list_splice(&tmp, dispatch_queue);
+ return;
+ }
+
+ /* Move inodes from one superblock together */
+ while (!list_empty(&tmp)) {
+ inode = list_entry(tmp.prev, struct inode, i_list);
+ sb = inode->i_sb;
+ list_for_each_prev_safe(pos, node, &tmp) {
+ inode = list_entry(pos, struct inode, i_list);
+ if (inode->i_sb == sb)
+ list_move(&inode->i_list, dispatch_queue);
+ }
}
}
/*
* Queue all expired dirty inodes for io, eldest first.
*/
-static void queue_io(struct super_block *sb,
- unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- list_splice_init(&sb->s_more_io, sb->s_io.prev);
- move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+ list_splice_init(&wb->b_more_io, wb->b_io.prev);
+ move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
{
- return !list_empty(&sb->s_dirty) ||
- !list_empty(&sb->s_io) ||
- !list_empty(&sb->s_more_io);
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+ return inode->i_sb->s_op->write_inode(inode, sync);
+ return 0;
}
-EXPORT_SYMBOL(sb_has_dirty_inodes);
/*
* Wait for writeback on an inode to complete.
@@ -322,11 +432,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_state & I_SYNC) {
/*
* If this inode is locked for writeback and we are not doing
- * writeback-for-data-integrity, move it to s_more_io so that
+ * writeback-for-data-integrity, move it to b_more_io so that
* writeback can proceed with the other inodes on s_io.
*
* We'll have another go at writing back this inode when we
- * completed a full scan of s_io.
+ * completed a full scan of b_io.
*/
if (!wait) {
requeue_io(inode);
@@ -366,16 +476,26 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
- if (!(inode->i_state & I_DIRTY) &&
- mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+ /*
+ * More pages get dirtied by a fast dirtier.
+ */
+ goto select_queue;
+ } else if (inode->i_state & I_DIRTY) {
+ /*
+ * At least XFS will redirty the inode during the
+ * writeback (delalloc) and on io completion (isize).
+ */
+ redirty_tail(inode);
+ } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
- * the inode; Move it from s_io onto s_more_io/s_dirty.
+ * the inode; Move it from b_io onto b_more_io/b_dirty.
*/
/*
* akpm: if the caller was the kupdate function we put
- * this inode at the head of s_dirty so it gets first
+ * this inode at the head of b_dirty so it gets first
* consideration. Otherwise, move it to the tail, for
* the reasons described there. I'm not really sure
* how much sense this makes. Presumably I had a good
@@ -385,10 +505,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->for_kupdate) {
/*
* For the kupdate function we move the inode
- * to s_more_io so it will get more writeout as
+ * to b_more_io so it will get more writeout as
* soon as the queue becomes uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
+select_queue:
if (wbc->nr_to_write <= 0) {
/*
* slice used up: queue for next turn
@@ -411,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_DIRTY_PAGES;
redirty_tail(inode);
}
- } else if (inode->i_state & I_DIRTY) {
- /*
- * Someone redirtied the inode while were writing back
- * the pages.
- */
- redirty_tail(inode);
} else if (atomic_read(&inode->i_count)) {
/*
* The inode is clean, inuse
@@ -433,51 +548,96 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
+static void unpin_sb_for_writeback(struct super_block **psb)
+{
+ struct super_block *sb = *psb;
+
+ if (sb) {
+ up_read(&sb->s_umount);
+ put_super(sb);
+ *psb = NULL;
+ }
+}
+
/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
*
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
*/
-void generic_sync_sb_inodes(struct super_block *sb,
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+ struct inode *inode, struct super_block **psb)
+{
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * If this sb is already pinned, nothing more to do. If not and
+ * *psb is non-NULL, unpin the old one first
+ */
+ if (sb == *psb)
+ return 0;
+ else if (*psb)
+ unpin_sb_for_writeback(psb);
+
+ /*
+ * Caller must already hold the ref for this
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ return 0;
+ }
+
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ if (down_read_trylock(&sb->s_umount)) {
+ if (sb->s_root) {
+ spin_unlock(&sb_lock);
+ goto pinned;
+ }
+ /*
+ * umounted, drop rwsem again and fall through to failure
+ */
+ up_read(&sb->s_umount);
+ }
+
+ sb->s_count--;
+ spin_unlock(&sb_lock);
+ return 1;
+pinned:
+ *psb = sb;
+ return 0;
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
+ struct super_block *sb = wbc->sb, *pin_sb = NULL;
+ const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
- int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&sb->s_io))
- queue_io(sb, wbc->older_than_this);
- while (!list_empty(&sb->s_io)) {
- struct inode *inode = list_entry(sb->s_io.prev,
+ if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ queue_io(wb, wbc->older_than_this);
+
+ while (!list_empty(&wb->b_io)) {
+ struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
- struct address_space *mapping = inode->i_mapping;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
- if (!bdi_cap_writeback_dirty(bdi)) {
+ /*
+ * super block given and doesn't match, skip this inode
+ */
+ if (sb && sb != inode->i_sb) {
redirty_tail(inode);
- if (sb_is_blkdev_sb(sb)) {
+ continue;
+ }
+
+ if (!bdi_cap_writeback_dirty(wb->bdi)) {
+ redirty_tail(inode);
+ if (is_blkdev_sb) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
@@ -497,21 +657,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
continue;
}
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
wbc->encountered_congestion = 1;
- if (!sb_is_blkdev_sb(sb))
+ if (!is_blkdev_sb)
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
- if (wbc->bdi && bdi != wbc->bdi) {
- if (!sb_is_blkdev_sb(sb))
- break; /* fs has the wrong queue */
- requeue_io(inode);
- continue; /* blockdev has wrong queue */
- }
-
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -519,16 +672,15 @@ void generic_sync_sb_inodes(struct super_block *sb,
if (inode_dirtied_after(inode, start))
break;
- /* Is another pdflush already flushing this queue? */
- if (current_is_pdflush() && !writeback_acquire(bdi))
- break;
+ if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
+ requeue_io(inode);
+ continue;
+ }
BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- if (current_is_pdflush())
- writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -544,144 +696,535 @@ void generic_sync_sb_inodes(struct super_block *sb,
wbc->more_io = 1;
break;
}
- if (!list_empty(&sb->s_more_io))
+ if (!list_empty(&wb->b_more_io))
wbc->more_io = 1;
}
- if (sync) {
- struct inode *inode, *old_inode = NULL;
+ unpin_sb_for_writeback(&pin_sb);
+ spin_unlock(&inode_lock);
+ /* Leave any unwritten inodes on b_io */
+}
+
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = wbc->bdi;
+
+ writeback_inodes_wb(&bdi->wb, wbc);
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
+static inline bool over_bground_thresh(void)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+ return (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb,
+ struct wb_writeback_args *args)
+{
+ struct writeback_control wbc = {
+ .bdi = wb->bdi,
+ .sb = args->sb,
+ .sync_mode = args->sync_mode,
+ .older_than_this = NULL,
+ .for_kupdate = args->for_kupdate,
+ .range_cyclic = args->range_cyclic,
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
+ struct inode *inode;
+
+ if (wbc.for_kupdate) {
+ wbc.older_than_this = &oldest_jif;
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ }
+ if (!wbc.range_cyclic) {
+ wbc.range_start = 0;
+ wbc.range_end = LLONG_MAX;
+ }
+
+ for (;;) {
/*
- * Data integrity sync. Must wait for all pages under writeback,
- * because there may have been pages dirtied before our sync
- * call, but which had writeout started before we write it out.
- * In which case, the inode may not be on the dirty list, but
- * we still have to wait for that writeout.
+ * Stop writeback when nr_pages has been consumed
*/
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- struct address_space *mapping;
-
- if (inode->i_state &
- (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
- continue;
- mapping = inode->i_mapping;
- if (mapping->nrpages == 0)
- continue;
- __iget(inode);
- spin_unlock(&inode_lock);
- /*
- * We hold a reference to 'inode' so it couldn't have
- * been removed from s_inodes list while we dropped the
- * inode_lock. We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it
- * under inode_lock. So we keep the reference and iput
- * it later.
- */
- iput(old_inode);
- old_inode = inode;
+ if (args->nr_pages <= 0)
+ break;
- filemap_fdatawait(mapping);
+ /*
+ * For background writeout, stop when we are below the
+ * background dirty threshold
+ */
+ if (args->for_background && !over_bground_thresh())
+ break;
- cond_resched();
+ wbc.more_io = 0;
+ wbc.encountered_congestion = 0;
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.pages_skipped = 0;
+ writeback_inodes_wb(wb, &wbc);
+ args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- spin_lock(&inode_lock);
+ /*
+ * If we consumed everything, see if we have more
+ */
+ if (wbc.nr_to_write <= 0)
+ continue;
+ /*
+ * Didn't write everything and we don't have more IO, bail
+ */
+ if (!wbc.more_io)
+ break;
+ /*
+ * Did we write something? Try for more
+ */
+ if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ continue;
+ /*
+ * Nothing written. Wait for some inode to
+ * become available for writeback. Otherwise
+ * we'll just busyloop.
+ */
+ spin_lock(&inode_lock);
+ if (!list_empty(&wb->b_more_io)) {
+ inode = list_entry(wb->b_more_io.prev,
+ struct inode, i_list);
+ inode_wait_for_writeback(inode);
}
spin_unlock(&inode_lock);
- iput(old_inode);
- } else
- spin_unlock(&inode_lock);
+ }
- return; /* Leave any unwritten inodes on s_io */
+ return wrote;
}
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
-static void sync_sb_inodes(struct super_block *sb,
- struct writeback_control *wbc)
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct bdi_work *work, *ret = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(work, &bdi->work_list, list) {
+ if (!test_bit(wb->nr, &work->seen))
+ continue;
+ clear_bit(wb->nr, &work->seen);
+
+ ret = work;
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+ unsigned long expired;
+ long nr_pages;
+
+ expired = wb->last_old_flush +
+ msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (time_before(jiffies, expired))
+ return 0;
+
+ wb->last_old_flush = jiffies;
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ if (nr_pages) {
+ struct wb_writeback_args args = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .for_kupdate = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &args);
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+ struct backing_dev_info *bdi = wb->bdi;
+ struct bdi_work *work;
+ long wrote = 0;
+
+ while ((work = get_next_work_item(bdi, wb)) != NULL) {
+ struct wb_writeback_args args = work->args;
+
+ /*
+ * Override sync mode, in case we must wait for completion
+ */
+ if (force_wait)
+ work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+
+ /*
+ * If this isn't a data integrity operation, just notify
+ * that we have seen this work and we are now starting it.
+ */
+ if (args.sync_mode == WB_SYNC_NONE)
+ wb_clear_pending(wb, work);
+
+ wrote += wb_writeback(wb, &args);
+
+ /*
+ * This is a data integrity writeback, so only do the
+ * notification when we have completed the work.
+ */
+ if (args.sync_mode == WB_SYNC_ALL)
+ wb_clear_pending(wb, work);
+ }
+
+ /*
+ * Check for periodic writeback, kupdated() style
+ */
+ wrote += wb_check_old_data_flush(wb);
+
+ return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+ unsigned long last_active = jiffies;
+ unsigned long wait_jiffies = -1UL;
+ long pages_written;
+
+ while (!kthread_should_stop()) {
+ pages_written = wb_do_writeback(wb, 0);
+
+ if (pages_written)
+ last_active = jiffies;
+ else if (wait_jiffies != -1UL) {
+ unsigned long max_idle;
+
+ /*
+ * Longest period of inactivity that we tolerate. If we
+ * see dirty data again later, the task will get
+ * recreated automatically.
+ */
+ max_idle = max(5UL * 60 * HZ, wait_jiffies);
+ if (time_after(jiffies, max_idle + last_active))
+ break;
+ }
+
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ try_to_freeze();
+ }
+
+ return 0;
+}
+
+/*
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
+ */
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
- generic_sync_sb_inodes(sb, wbc);
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ };
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_alloc_queue_work(bdi, &args);
+ }
+
+ rcu_read_unlock();
}
/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+ if (nr_pages == 0)
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ bdi_writeback_all(NULL, nr_pages);
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+ if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+ struct dentry *dentry;
+ const char *name = "?";
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ spin_lock(&dentry->d_lock);
+ name = (const char *) dentry->d_name.name;
+ }
+ printk(KERN_DEBUG
+ "%s(%d): dirtied inode %lu (%s) on %s\n",
+ current->comm, task_pid_nr(current), inode->i_ino,
+ name, inode->i_sb->s_id);
+ if (dentry) {
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
+ }
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
*
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
*
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
*
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages. This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
*/
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
{
- struct super_block *sb;
+ struct super_block *sb = inode->i_sb;
- might_sleep();
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry_reverse(sb, &super_blocks, s_list) {
- if (sb_has_dirty_inodes(sb)) {
- /* we're making our own get_super here */
- sb->s_count++;
- spin_unlock(&sb_lock);
- /*
- * If we can't get the readlock, there's no sense in
- * waiting around, most of the time the FS is going to
- * be unmounted by the time it is released.
- */
- if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
- sync_sb_inodes(sb, wbc);
- up_read(&sb->s_umount);
+ /*
+ * Don't do this for I_DIRTY_PAGES - that doesn't actually
+ * dirty the inode itself
+ */
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (sb->s_op->dirty_inode)
+ sb->s_op->dirty_inode(inode);
+ }
+
+ /*
+ * make sure that changes are seen by all cpus before we test i_state
+ * -- mikulas
+ */
+ smp_mb();
+
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) == flags)
+ return;
+
+ if (unlikely(block_dump))
+ block_dump___mark_inode_dirty(inode);
+
+ spin_lock(&inode_lock);
+ if ((inode->i_state & flags) != flags) {
+ const int was_dirty = inode->i_state & I_DIRTY;
+
+ inode->i_state |= flags;
+
+ /*
+ * If the inode is being synced, just update its dirty state.
+ * The unlocker will place the inode on the appropriate
+ * superblock list, based upon its state.
+ */
+ if (inode->i_state & I_SYNC)
+ goto out;
+
+ /*
+ * Only add valid (hashed) inodes to the superblock's
+ * dirty list. Add blockdev inodes as well.
+ */
+ if (!S_ISBLK(inode->i_mode)) {
+ if (hlist_unhashed(&inode->i_hash))
+ goto out;
+ }
+ if (inode->i_state & (I_FREEING|I_CLEAR))
+ goto out;
+
+ /*
+ * If the inode was already on b_dirty/b_io/b_more_io, don't
+ * reposition it (that would break b_dirty time-ordering).
+ */
+ if (!was_dirty) {
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ struct backing_dev_info *bdi = wb->bdi;
+
+ if (bdi_cap_writeback_dirty(bdi) &&
+ !test_bit(BDI_registered, &bdi->state)) {
+ WARN_ON(1);
+ printk(KERN_ERR "bdi-%s not registered\n",
+ bdi->name);
}
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
+
+ inode->dirtied_when = jiffies;
+ list_move(&inode->i_list, &wb->b_dirty);
}
- if (wbc->nr_to_write <= 0)
- break;
}
- spin_unlock(&sb_lock);
+out:
+ spin_unlock(&inode_lock);
}
+EXPORT_SYMBOL(__mark_inode_dirty);
/*
- * writeback and wait upon the filesystem's dirty inodes. The caller will
- * do this in two passes - one to write, and one to wait.
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
*
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
*
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-void sync_inodes_sb(struct super_block *sb, int wait)
+static void wait_sb_inodes(struct super_block *sb)
{
- struct writeback_control wbc = {
- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ struct inode *inode, *old_inode = NULL;
+
+ /*
+ * We need to be protected against the filesystem going from
+ * r/o to r/w or vice versa.
+ */
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ spin_lock(&inode_lock);
- if (!wait) {
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ /*
+ * Data integrity sync. Must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync
+ * call, but which had writeout started before we write it out.
+ * In which case, the inode may not be on the dirty list, but
+ * we still have to wait for that writeout.
+ */
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct address_space *mapping;
- wbc.nr_to_write = nr_dirty + nr_unstable +
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+ continue;
+ mapping = inode->i_mapping;
+ if (mapping->nrpages == 0)
+ continue;
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have
+ * been removed from s_inodes list while we dropped the
+ * inode_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it
+ * under inode_lock. So we keep the reference and iput
+ * it later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ filemap_fdatawait(mapping);
+
+ cond_resched();
+
+ spin_lock(&inode_lock);
+ }
+ spin_unlock(&inode_lock);
+ iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ long nr_to_write;
+
+ nr_to_write = nr_dirty + nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- } else
- wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
- sync_sb_inodes(sb, &wbc);
+ bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb - sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+ bdi_sync_writeback(sb->s_bdi, sb);
+ wait_sb_inodes(sb);
+}
+EXPORT_SYMBOL(sync_inodes_sb);
/**
* write_inode_now - write an inode to disk
@@ -737,57 +1280,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @mapping: the address_space that should be flushed
- * @what: what to write and wait upon
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- *
- * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon.
- *
- * OSYNC_DATA: i_mapping's dirty data
- * OSYNC_METADATA: the buffers at i_mapping->private_list
- * OSYNC_INODE: the inode itself
- */
-
-int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
-{
- int err = 0;
- int need_write_inode_now = 0;
- int err2;
-
- if (what & OSYNC_DATA)
- err = filemap_fdatawrite(mapping);
- if (what & (OSYNC_METADATA|OSYNC_DATA)) {
- err2 = sync_mapping_buffers(mapping);
- if (!err)
- err = err2;
- }
- if (what & OSYNC_DATA) {
- err2 = filemap_fdatawait(mapping);
- if (!err)
- err = err2;
- }
-
- spin_lock(&inode_lock);
- if ((inode->i_state & I_DIRTY) &&
- ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
- need_write_inode_now = 1;
- spin_unlock(&inode_lock);
-
- if (need_write_inode_now) {
- err2 = write_inode_now(inode, 1);
- if (!err)
- err = err2;
- }
- else
- inode_sync_wait(inode);
-
- return err;
-}
-EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
return simple_read_from_buffer(buf, len, ppos, tmp, size);
}
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ char tmp[32];
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos || count >= sizeof(tmp) - 1)
+ return -EINVAL;
+
+ if (copy_from_user(tmp, buf, count))
+ return -EINVAL;
+
+ tmp[count] = '\0';
+
+ err = strict_strtoul(tmp, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->max_background;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->max_background = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->congestion_threshold;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->congestion_threshold = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
.read = fuse_conn_waiting_read,
};
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+};
+
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
goto err;
if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
- NULL, &fuse_ctl_waiting_ops) ||
+ NULL, &fuse_ctl_waiting_ops) ||
!fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
- NULL, &fuse_ctl_abort_ops))
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
goto err;
return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
d_drop(dentry);
dput(dentry);
}
- fuse_control_sb->s_root->d_inode->i_nlink--;
+ drop_nlink(fuse_control_sb->s_root->d_inode);
}
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
static void flush_bg_queue(struct fuse_conn *fc)
{
- while (fc->active_background < FUSE_MAX_BACKGROUND &&
+ while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
@@ -280,11 +280,11 @@ __releases(&fc->lock)
list_del(&req->intr_entry);
req->state = FUSE_REQ_FINISHED;
if (req->background) {
- if (fc->num_background == FUSE_MAX_BACKGROUND) {
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
{
req->background = 1;
fc->num_background++;
- if (fc->num_background == FUSE_MAX_BACKGROUND)
+ if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->bdi_initialized) {
set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e703654e7f40..8ada78aade58 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -712,8 +712,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (newent->d_inode)
+ if (newent->d_inode) {
+ fuse_invalidate_attr(newent->d_inode);
fuse_invalidate_entry_cache(newent);
+ }
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -1276,14 +1278,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
return 0;
if (attr->ia_valid & ATTR_SIZE) {
- unsigned long limit;
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
+ err = inode_newsize_ok(inode, attr->ia_size);
+ if (err)
+ return err;
is_truncate = true;
}
@@ -1350,8 +1347,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
- if (outarg.attr.size < oldsize)
- fuse_truncate(inode->i_mapping, outarg.attr.size);
+ truncate_pagecache(inode, oldsize, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cbc464043b6f..c18913a777ae 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
break;
}
}
- fuse_put_request(fc, req);
+ if (!IS_ERR(req))
+ fuse_put_request(fc, req);
if (res > 0)
*ppos = pos;
@@ -1313,7 +1314,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return 0;
}
-static struct vm_operations_struct fuse_file_vm_ops = {
+static const struct vm_operations_struct fuse_file_vm_ops = {
.close = fuse_vma_close,
.fault = filemap_fault,
.page_mkwrite = fuse_page_mkwrite,
@@ -1599,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
kaddr += copy;
}
- kunmap(map);
+ kunmap(page);
}
return 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..01cc462ff45d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -38,7 +32,7 @@
#define FUSE_NAME_MAX 1024
/** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
/** Global mutex protecting fuse_conn_list and the control filesystem */
extern struct mutex fuse_mutex;
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
/** Number of requests currently in the background */
unsigned num_background;
@@ -602,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid);
-void fuse_truncate(struct address_space *mapping, loff_t offset);
-
/**
* Initialize the client device
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..1a822ce2b24b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
#define FUSE_SUPER_MAGIC 0x65735546
#define FUSE_DEFAULT_BLKSIZE 512
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
struct fuse_mount_data {
int fd;
unsigned rootmode;
@@ -115,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
return 0;
}
-void fuse_truncate(struct address_space *mapping, loff_t offset)
-{
- /* See vmtruncate() */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-}
-
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid)
{
@@ -180,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
spin_unlock(&fc->lock);
if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
- if (attr->size < oldsize)
- fuse_truncate(inode->i_mapping, attr->size);
+ truncate_pagecache(inode, oldsize, attr->size);
invalidate_inode_pages2(inode->i_mapping);
}
}
@@ -517,6 +533,8 @@ void fuse_conn_init(struct fuse_conn *fc)
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
@@ -727,6 +745,54 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
+static void sanitize_global_limit(unsigned *limit)
+{
+ if (*limit == 0)
+ *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ sizeof(struct fuse_req);
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+}
+
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +802,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
else {
unsigned long ra_pages;
+ process_init_limits(fc, arg);
+
if (arg->minor >= 6) {
ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +869,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ fc->bdi.name = "fuse";
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
/* fuse does it's own writeback accounting */
@@ -893,6 +962,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_put_conn;
+ sb->s_bdi = &fc->bdi;
+
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1147,6 +1218,9 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
return 0;
err_sysfs_cleanup:
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d2090..4dcddf83326f 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,8 @@ config GFS2_FS
select FS_POSIX_ACL
select CRC32
select SLOW_WORK
+ select QUOTA
+ select QUOTACTL
help
A cluster filesystem.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
EXTRA_CFLAGS := -I$(src)
obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
glops.o inode.o log.o lops.o main.o meta_io.o \
aops.o dentry.o export.o file.o \
ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3eb1ea846173 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/gfs2_ondisk.h>
@@ -19,128 +20,51 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "trans.h"
#include "util.h"
-#define ACL_ACCESS 1
-#define ACL_DEFAULT 0
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode)
+static const char *gfs2_acl_name(int type)
{
- struct posix_acl *acl;
- int error;
-
- error = gfs2_acl_validate_remove(ip, access);
- if (error)
- return error;
-
- if (!er->er_data)
- return -EINVAL;
-
- acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl) {
- *remove = 1;
- return 0;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out;
-
- if (access) {
- error = posix_acl_equiv_mode(acl, mode);
- if (!error)
- *remove = 1;
- else if (error > 0)
- error = 0;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return GFS2_POSIX_ACL_ACCESS;
+ case ACL_TYPE_DEFAULT:
+ return GFS2_POSIX_ACL_DEFAULT;
}
-
-out:
- posix_acl_release(acl);
- return error;
+ return NULL;
}
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
{
- if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(&ip->i_inode))
- return -EPERM;
- if (S_ISLNK(ip->i_inode.i_mode))
- return -EOPNOTSUPP;
- if (!access && !S_ISDIR(ip->i_inode.i_mode))
- return -EACCES;
-
- return 0;
-}
-
-static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
- struct gfs2_ea_location *el, char **data, unsigned int *len)
-{
- struct gfs2_ea_request er;
- struct gfs2_ea_location el_this;
- int error;
+ struct posix_acl *acl;
+ const char *name;
+ char *data;
+ int len;
if (!ip->i_eattr)
- return 0;
+ return NULL;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- if (access) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- } else {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- }
- er.er_type = GFS2_EATYPE_SYS;
+ acl = get_cached_acl(&ip->i_inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
- if (!el)
- el = &el_this;
+ name = gfs2_acl_name(type);
+ if (name == NULL)
+ return ERR_PTR(-EINVAL);
- error = gfs2_ea_find(ip, &er, el);
- if (error)
- return error;
- if (!el->el_ea)
- return 0;
- if (!GFS2_EA_DATA_LEN(el->el_ea))
- goto out;
-
- er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
- er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
- error = -ENOMEM;
- if (!er.er_data)
- goto out;
-
- error = gfs2_ea_get_copy(ip, el, er.er_data);
- if (error)
- goto out_kfree;
+ len = gfs2_xattr_acl_get(ip, name, &data);
+ if (len < 0)
+ return ERR_PTR(len);
+ if (len == 0)
+ return NULL;
- if (acl) {
- *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
- if (IS_ERR(*acl))
- error = PTR_ERR(*acl);
- }
-
-out_kfree:
- if (error || !data)
- kfree(er.er_data);
- else {
- *data = er.er_data;
- *len = er.er_data_len;
- }
-out:
- if (error || el == &el_this)
- brelse(el->el_bh);
- return error;
+ acl = posix_acl_from_xattr(data, len);
+ kfree(data);
+ return acl;
}
/**
@@ -153,12 +77,12 @@ out:
int gfs2_check_acl(struct inode *inode, int mask)
{
- struct posix_acl *acl = NULL;
+ struct posix_acl *acl;
int error;
- error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
- if (error)
- return error;
+ acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (acl) {
error = posix_acl_permission(inode, acl, mask);
@@ -169,58 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *dibh;
- int error;
+ int error = 0;
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_assert_withdraw(sdp,
- (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
- ip->i_inode.i_mode = mode;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ iattr.ia_valid = ATTR_MODE;
+ iattr.ia_mode = mode;
+
+ error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
}
- gfs2_trans_end(sdp);
+ return error;
+}
+
+static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+{
+ int error;
+ int len;
+ char *data;
+ const char *name = gfs2_acl_name(type);
- return 0;
+ BUG_ON(name == NULL);
+ len = posix_acl_to_xattr(acl, NULL, 0);
+ if (len == 0)
+ return 0;
+ data = kmalloc(len, GFP_NOFS);
+ if (data == NULL)
+ return -ENOMEM;
+ error = posix_acl_to_xattr(acl, data, len);
+ if (error < 0)
+ goto out;
+ error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+out:
+ kfree(data);
+ return error;
}
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct posix_acl *acl = NULL, *clone;
- struct gfs2_ea_request er;
- mode_t mode = ip->i_inode.i_mode;
- int error;
+ struct posix_acl *acl, *clone;
+ mode_t mode = inode->i_mode;
+ int error = 0;
if (!sdp->sd_args.ar_posix_acl)
return 0;
- if (S_ISLNK(ip->i_inode.i_mode))
+ if (S_ISLNK(inode->i_mode))
return 0;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = GFS2_EATYPE_SYS;
-
- error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
- &er.er_data, &er.er_data_len);
- if (error)
- return error;
+ acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (!acl) {
mode &= ~current_umask();
- if (mode != ip->i_inode.i_mode)
- error = munge_mode(ip, mode);
+ if (mode != inode->i_mode)
+ error = gfs2_set_mode(inode, mode);
return error;
}
+ if (S_ISDIR(inode->i_mode)) {
+ error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto out;
+ }
+
clone = posix_acl_clone(acl, GFP_NOFS);
error = -ENOMEM;
if (!clone)
@@ -228,46 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
posix_acl_release(acl);
acl = clone;
- if (S_ISDIR(ip->i_inode.i_mode)) {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- error = gfs2_system_eaops.eo_set(ip, &er);
- if (error)
- goto out;
- }
-
error = posix_acl_create_masq(acl, &mode);
if (error < 0)
goto out;
- if (error > 0) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
- er.er_mode = mode;
- er.er_flags = GFS2_ERF_MODE;
- error = gfs2_system_eaops.eo_set(ip, &er);
- if (error)
- goto out;
- } else
- munge_mode(ip, mode);
+ if (error == 0)
+ goto munge;
+ error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
+ if (error)
+ goto out;
+munge:
+ error = gfs2_set_mode(inode, mode);
out:
posix_acl_release(acl);
- kfree(er.er_data);
return error;
}
int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
{
- struct posix_acl *acl = NULL, *clone;
- struct gfs2_ea_location el;
+ struct posix_acl *acl, *clone;
char *data;
unsigned int len;
int error;
- error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
- if (error)
- return error;
+ acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (!acl)
return gfs2_setattr_simple(ip, attr);
@@ -280,14 +207,134 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
error = posix_acl_chmod_masq(acl, attr->ia_mode);
if (!error) {
+ len = posix_acl_to_xattr(acl, NULL, 0);
+ data = kmalloc(len, GFP_NOFS);
+ error = -ENOMEM;
+ if (data == NULL)
+ goto out;
posix_acl_to_xattr(acl, data, len);
- error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+ error = gfs2_xattr_acl_chmod(ip, attr, data);
+ kfree(data);
+ set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
}
out:
posix_acl_release(acl);
- brelse(el.el_bh);
- kfree(data);
return error;
}
+static int gfs2_acl_type(const char *name)
+{
+ if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+ return ACL_TYPE_ACCESS;
+ if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+ return ACL_TYPE_DEFAULT;
+ return -EINVAL;
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type;
+ int error;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+
+ acl = gfs2_acl_get(GFS2_I(inode), type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+
+ error = posix_acl_to_xattr(acl, buffer, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct posix_acl *acl = NULL;
+ int error = 0, type;
+
+ if (!sdp->sd_args.ar_posix_acl)
+ return -EOPNOTSUPP;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+ if (flags & XATTR_CREATE)
+ return -EINVAL;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return value ? -EACCES : 0;
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!value)
+ goto set_acl;
+
+ acl = posix_acl_from_xattr(value, size);
+ if (!acl) {
+ /*
+ * acl_set_file(3) may request that we set default ACLs with
+ * zero length -- defend (gracefully) against that here.
+ */
+ goto out;
+ }
+ if (IS_ERR(acl)) {
+ error = PTR_ERR(acl);
+ goto out;
+ }
+
+ error = posix_acl_valid(acl);
+ if (error)
+ goto out_release;
+
+ error = -EINVAL;
+ if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+ goto out_release;
+
+ if (type == ACL_TYPE_ACCESS) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+
+ if (error <= 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+
+ if (error < 0)
+ return error;
+ }
+
+ error = gfs2_set_mode(inode, mode);
+ if (error)
+ goto out_release;
+ }
+
+set_acl:
+ error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+ if (!error) {
+ if (acl)
+ set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
+ }
+out_release:
+ posix_acl_release(acl);
+out:
+ return error;
+}
+
+struct xattr_handler gfs2_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = gfs2_xattr_system_get,
+ .set = gfs2_xattr_system_set,
+};
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..9306a2e6620c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
#include "incore.h"
#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN 16
#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN 17
+#define GFS2_ACL_MAX_ENTRIES 25
-#define GFS2_ACL_IS_ACCESS(name, len) \
- ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
- !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
-
-#define GFS2_ACL_IS_DEFAULT(name, len) \
- ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
- !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-
-struct gfs2_ea_request;
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..7b8da9415267 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int i;
int ret;
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
if (ret || (--(wbc->nr_to_write) <= 0))
ret = 1;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- ret = 1;
- }
-
}
gfs2_trans_end(sdp);
return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
static int gfs2_write_cache_jdata(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
int scanned = 0;
int range_whole = 0;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- return 0;
- }
-
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
mark_inode_dirty(inode);
}
- if (inode == sdp->sd_rindex)
+ if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
+ ip->i_gh.gh_flags |= GL_NOCACHE;
+ }
brelse(dibh);
gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
mark_inode_dirty(inode);
}
- if (inode == sdp->sd_rindex)
+ if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
+ ip->i_gh.gh_flags |= GL_NOCACHE;
+ }
brelse(dibh);
gfs2_trans_end(sdp);
@@ -1135,6 +1127,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1144,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1160,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
return 0;
}
+static int gfs2_dentry_delete(struct dentry *dentry)
+{
+ struct gfs2_inode *ginode;
+
+ if (!dentry->d_inode)
+ return 0;
+
+ ginode = GFS2_I(dentry->d_inode);
+ if (!ginode->i_iopen_gh.gh_gl)
+ return 0;
+
+ if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+ return 1;
+
+ return 0;
+}
+
const struct dentry_operations gfs2_dops = {
.d_revalidate = gfs2_drevalidate,
.d_hash = gfs2_dhash,
+ .d_delete = gfs2_dentry_delete,
};
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..25fddc100f18 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
return ERR_PTR(-EIO);
}
-
-/**
- * dirent_first - Return the first dirent
- * @dip: the directory
- * @bh: The buffer
- * @dent: Pointer to list of dirents
- *
- * return first dirent whether bh points to leaf or stuffed dinode
- *
- * Returns: IS_LEAF, IS_DINODE, or -errno
- */
-
-static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
- struct gfs2_dirent **dent)
-{
- struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
-
- if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
- if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
- return -EIO;
- *dent = (struct gfs2_dirent *)(bh->b_data +
- sizeof(struct gfs2_leaf));
- return IS_LEAF;
- } else {
- if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
- return -EIO;
- *dent = (struct gfs2_dirent *)(bh->b_data +
- sizeof(struct gfs2_dinode));
- return IS_DINODE;
- }
-}
-
static int dirent_check_reclen(struct gfs2_inode *dip,
const struct gfs2_dirent *d, const void *end_p)
{
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
divider = (start + half_len) << (32 - dip->i_depth);
/* Copy the entries */
- dirent_first(dip, obh, &dent);
+ dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
do {
next = dent;
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
-#include "util.h"
-
-/**
- * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
- * @namep: ea name, possibly with type appended
- *
- * Returns: GFS2_EATYPE_XXX
- */
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
-{
- unsigned int type;
-
- if (strncmp(name, "system.", 7) == 0) {
- type = GFS2_EATYPE_SYS;
- if (truncated_name)
- *truncated_name = name + sizeof("system.") - 1;
- } else if (strncmp(name, "user.", 5) == 0) {
- type = GFS2_EATYPE_USR;
- if (truncated_name)
- *truncated_name = name + sizeof("user.") - 1;
- } else if (strncmp(name, "security.", 9) == 0) {
- type = GFS2_EATYPE_SECURITY;
- if (truncated_name)
- *truncated_name = name + sizeof("security.") - 1;
- } else {
- type = GFS2_EATYPE_UNUSED;
- if (truncated_name)
- *truncated_name = NULL;
- }
-
- return type;
-}
-
-static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
- !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
- (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
- GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
- return -EOPNOTSUPP;
-
- return gfs2_ea_get_i(ip, er);
-}
-
-static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- int remove = 0;
- int error;
-
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- if (!(er->er_flags & GFS2_ERF_MODE)) {
- er->er_mode = ip->i_inode.i_mode;
- er->er_flags |= GFS2_ERF_MODE;
- }
- error = gfs2_acl_validate_set(ip, 1, er,
- &remove, &er->er_mode);
- if (error)
- return error;
- error = gfs2_ea_set_i(ip, er);
- if (error)
- return error;
- if (remove)
- gfs2_ea_remove_i(ip, er);
- return 0;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- error = gfs2_acl_validate_set(ip, 0, er,
- &remove, NULL);
- if (error)
- return error;
- if (!remove)
- error = gfs2_ea_set_i(ip, er);
- else {
- error = gfs2_ea_remove_i(ip, er);
- if (error == -ENODATA)
- error = 0;
- }
- return error;
- }
-
- return -EPERM;
-}
-
-static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 1);
- if (error)
- return error;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 0);
- if (error)
- return error;
-
- } else
- return -EPERM;
-
- return gfs2_ea_remove_i(ip, er);
-}
-
-static const struct gfs2_eattr_operations gfs2_user_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "user",
-};
-
-const struct gfs2_eattr_operations gfs2_system_eaops = {
- .eo_get = system_eo_get,
- .eo_set = system_eo_set,
- .eo_remove = system_eo_remove,
- .eo_name = "system",
-};
-
-static const struct gfs2_eattr_operations gfs2_security_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "security",
-};
-
-const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
- NULL,
- &gfs2_user_eaops,
- &gfs2_system_eaops,
- &gfs2_security_eaops,
-};
-
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EAOPS_DOT_H__
-#define __EAOPS_DOT_H__
-
-struct gfs2_ea_request;
-struct gfs2_inode;
-
-struct gfs2_eattr_operations {
- int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- char *eo_name;
-};
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
-
-extern const struct gfs2_eattr_operations gfs2_system_eaops;
-
-extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
-
-#endif /* __EAOPS_DOT_H__ */
-
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
}
static struct dentry *gfs2_get_dentry(struct super_block *sb,
- struct gfs2_inum_host *inum)
+ struct gfs2_inum_host *inum)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- struct gfs2_holder i_gh, ri_gh, rgd_gh;
- struct gfs2_rgrpd *rgd;
+ struct gfs2_holder i_gh;
struct inode *inode;
struct dentry *dentry;
int error;
- /* System files? */
-
inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
if (error)
return ERR_PTR(error);
- error = gfs2_rindex_hold(sdp, &ri_gh);
+ error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
if (error)
goto fail;
- error = -EINVAL;
- rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
- if (!rgd)
- goto fail_rindex;
-
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
- if (error)
- goto fail_rindex;
-
- error = -ESTALE;
- if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
- goto fail_rgd;
-
- gfs2_glock_dq_uninit(&rgd_gh);
- gfs2_glock_dq_uninit(&ri_gh);
-
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
- inum->no_addr,
- 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto fail;
@@ -224,13 +203,6 @@ out_inode:
if (!IS_ERR(dentry))
dentry->d_op = &gfs2_dops;
return dentry;
-
-fail_rgd:
- gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
- gfs2_glock_dq_uninit(&ri_gh);
-
fail:
gfs2_glock_dq_uninit(&i_gh);
return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..4eb308aa3234 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
#include "rgrp.h"
#include "trans.h"
#include "util.h"
-#include "eaops.h"
/**
* gfs2_llseek - seek to a location in a file
@@ -419,7 +418,7 @@ out:
return ret;
}
-static struct vm_operations_struct gfs2_vm_ops = {
+static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = gfs2_page_mkwrite,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a55..a3f90ad2af80 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -672,12 +672,17 @@ out:
return;
out_sched:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ smp_mb__after_clear_bit();
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put_nolock(gl);
+ return;
+
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
- goto out;
+ smp_mb__after_clear_bit();
+ return;
}
static void delete_work_func(struct work_struct *work)
@@ -1375,10 +1380,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
handle_callback(gl, LM_ST_UNLOCKED, 0);
nr--;
}
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ smp_mb__after_clear_bit();
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put_nolock(gl);
spin_unlock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_lock(&lru_lock);
continue;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d0..13f0bd228132 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
return gl->gl_state == LM_ST_SHARED;
}
-static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
-{
- int ret;
- spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- return ret;
-}
-
int gfs2_glock_get(struct gfs2_sbd *sdp,
u64 number, const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c39..78554acc0605 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
+#include <linux/posix_acl.h>
#include "gfs2.h"
#include "incore.h"
@@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (flags & DIO_METADATA) {
struct address_space *mapping = gl->gl_aspace->i_mapping;
truncate_inode_pages(mapping, 0);
- if (ip)
+ if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
+ forget_all_cached_acls(&ip->i_inode);
+ }
}
if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..4792200978c8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
#define GFS2_DATA_WRITEBACK 1
#define GFS2_DATA_ORDERED 2
+#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW 0
+#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */
+#define GFS2_ERRORS_RO 2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC 3
+
struct gfs2_args {
char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -422,7 +428,12 @@ struct gfs2_args {
unsigned int ar_data:2; /* ordered/writeback */
unsigned int ar_meta:1; /* mount metafs */
unsigned int ar_discard:1; /* discard requests */
+ unsigned int ar_errors:2; /* errors=withdraw | panic */
+ unsigned int ar_nobarrier:1; /* do not send barriers */
int ar_commit; /* Commit interval */
+ int ar_statfs_quantum; /* The fast statfs interval */
+ int ar_quota_quantum; /* The quota interval */
+ int ar_statfs_percent; /* The % change to force sync */
};
struct gfs2_tune {
@@ -489,7 +500,6 @@ struct gfs2_sb_host {
*/
struct lm_lockstruct {
- u32 ls_id;
unsigned int ls_jid;
unsigned int ls_first;
unsigned int ls_first_done;
@@ -541,23 +551,18 @@ struct gfs2_sbd {
struct dentry *sd_root_dir;
struct inode *sd_jindex;
- struct inode *sd_inum_inode;
struct inode *sd_statfs_inode;
- struct inode *sd_ir_inode;
struct inode *sd_sc_inode;
struct inode *sd_qc_inode;
struct inode *sd_rindex;
struct inode *sd_quota_inode;
- /* Inum stuff */
-
- struct mutex sd_inum_mutex;
-
/* StatFS stuff */
spinlock_t sd_statfs_spin;
struct gfs2_statfs_change_host sd_statfs_master;
struct gfs2_statfs_change_host sd_statfs_local;
+ int sd_statfs_force_sync;
/* Resource group stuff */
@@ -580,7 +585,6 @@ struct gfs2_sbd {
struct gfs2_holder sd_journal_gh;
struct gfs2_holder sd_jinode_gh;
- struct gfs2_holder sd_ir_gh;
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..26ba2a4c4a2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
@@ -519,139 +519,6 @@ out:
return inode ? inode : ERR_PTR(error);
}
-static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
-{
- const struct gfs2_inum_range *str = buf;
-
- ir->ir_start = be64_to_cpu(str->ir_start);
- ir->ir_length = be64_to_cpu(str->ir_length);
-}
-
-static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
-{
- struct gfs2_inum_range *str = buf;
-
- str->ir_start = cpu_to_be64(ir->ir_start);
- str->ir_length = cpu_to_be64(ir->ir_length);
-}
-
-static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error) {
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return error;
- }
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (ir.ir_length) {
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir,
- bh->b_data + sizeof(struct gfs2_dinode));
- brelse(bh);
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return 0;
- }
-
- brelse(bh);
-
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-
- return 1;
-}
-
-static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
- struct gfs2_holder gh;
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (error)
- return error;
-
- error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
- if (error)
- goto out;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- goto out_end_trans;
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (!ir.ir_length) {
- struct buffer_head *m_bh;
- u64 x, y;
- __be64 z;
-
- error = gfs2_meta_inode_buffer(m_ip, &m_bh);
- if (error)
- goto out_brelse;
-
- z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
- x = y = be64_to_cpu(z);
- ir.ir_start = x;
- ir.ir_length = GFS2_INUM_QUANTUM;
- x += GFS2_INUM_QUANTUM;
- if (x < y)
- gfs2_consist_inode(m_ip);
- z = cpu_to_be64(x);
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
- *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
-
- brelse(m_bh);
- }
-
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
-
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-out_brelse:
- brelse(bh);
-out_end_trans:
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-out:
- gfs2_glock_dq_uninit(&gh);
- return error;
-}
-
-static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
-{
- int error;
-
- error = pick_formal_ino_1(sdp, inum);
- if (error <= 0)
- return error;
-
- error = pick_formal_ino_2(sdp, inum);
-
- return error;
-}
-
/**
* create_ok - OK to create a new on-disk inode here?
* @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
if (error)
goto out_ipreserv;
- *no_addr = gfs2_alloc_di(dip, generation);
+ error = gfs2_alloc_di(dip, no_addr, generation);
gfs2_trans_end(sdp);
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
size_t len;
void *value;
char *name;
- struct gfs2_ea_request er;
err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
&name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
return err;
}
- memset(&er, 0, sizeof(struct gfs2_ea_request));
-
- er.er_type = GFS2_EATYPE_SECURITY;
- er.er_name = name;
- er.er_data = value;
- er.er_name_len = strlen(name);
- er.er_data_len = len;
-
- err = gfs2_ea_set_i(ip, &er);
-
+ err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
kfree(value);
kfree(name);
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock;
- error = pick_formal_ino(sdp, &inum.no_formal_ino);
- if (error)
- goto fail_gunlock;
-
error = alloc_dinode(dip, &inum.no_addr, &generation);
if (error)
goto fail_gunlock;
+ inum.no_formal_ino = generation;
error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
- inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
- inum.no_addr,
- inum.no_formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+ inum.no_formal_ino, 0);
if (IS_ERR(inode))
goto fail_gunlock2;
@@ -1018,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
- error = gfs2_acl_create(dip, GFS2_I(inode));
+ error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock2;
@@ -1094,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
- str->di_header.__pad0 = 0;
str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
- str->di_header.__pad1 = 0;
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..4511b08fc451 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
memset(lh, 0, sizeof(struct gfs2_log_header));
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.__pad0 = cpu_to_be64(0);
lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
lh->lh_flags = cpu_to_be32(flags);
lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..de97632ba32f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+ struct gfs2_meta_header *mh;
struct gfs2_trans *tr;
lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
gfs2_meta_check(sdp, bd->bd_bh);
gfs2_pin(sdp, bd->bd_bh);
+ mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+ mh->__pad0 = cpu_to_be64(0);
+ mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
sdp->sd_log_num_buf++;
list_add(&le->le_list, &sdp->sd_log_le_buf);
tr->tr_num_buf_new++;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..edfee24f3636 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
#include <linux/mount.h>
#include <linux/gfs2_ondisk.h>
#include <linux/slow-work.h>
+#include <linux/quotaops.h>
#include "gfs2.h"
#include "incore.h"
@@ -62,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_warn_period = 10;
gt->gt_quota_scale_num = 1;
gt->gt_quota_scale_den = 1;
- gt->gt_quota_quantum = 60;
gt->gt_new_files_jdata = 0;
gt->gt_max_readahead = 1 << 18;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
- gt->gt_statfs_quantum = 30;
- gt->gt_statfs_slow = 0;
}
static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -84,7 +82,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
gfs2_tune_init(&sdp->sd_tune);
- mutex_init(&sdp->sd_inum_mutex);
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +830,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
if (error)
goto fail;
- /* Read in the master inode number inode */
- sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
- if (IS_ERR(sdp->sd_inum_inode)) {
- error = PTR_ERR(sdp->sd_inum_inode);
- fs_err(sdp, "can't read in inum inode: %d\n", error);
- goto fail_journal;
- }
-
-
/* Read in the master statfs inode */
sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail_inum;
+ goto fail_journal;
}
/* Read in the resource index inode */
@@ -876,8 +864,6 @@ fail_rindex:
iput(sdp->sd_rindex);
fail_statfs:
iput(sdp->sd_statfs_inode);
-fail_inum:
- iput(sdp->sd_inum_inode);
fail_journal:
init_journal(sdp, UNDO);
fail:
@@ -905,20 +891,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
return error;
}
- sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
- if (IS_ERR(sdp->sd_ir_inode)) {
- error = PTR_ERR(sdp->sd_ir_inode);
- fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
- goto fail;
- }
-
sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
if (IS_ERR(sdp->sd_sc_inode)) {
error = PTR_ERR(sdp->sd_sc_inode);
fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
- goto fail_ir_i;
+ goto fail;
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +910,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
iput(pn);
pn = NULL;
- ip = GFS2_I(sdp->sd_ir_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
- &sdp->sd_ir_gh);
- if (error) {
- fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
- goto fail_qc_i;
- }
-
ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_sc_gh);
if (error) {
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
- goto fail_ir_gh;
+ goto fail_qc_i;
}
ip = GFS2_I(sdp->sd_qc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +932,10 @@ fail_qc_gh:
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
fail_ut_gh:
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
fail_qc_i:
iput(sdp->sd_qc_inode);
fail_ut_i:
iput(sdp->sd_sc_inode);
-fail_ir_i:
- iput(sdp->sd_ir_inode);
fail:
if (pn)
iput(pn);
@@ -1063,7 +1026,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_ops = lm;
ls->ls_first = 1;
- ls->ls_id = 0;
for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1043,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_jid = option;
break;
case Opt_id:
- ret = match_int(&tmp[0], &option);
- if (ret)
- goto hostdata_error;
- ls->ls_id = option;
+ /* Obsolete, but left for backward compat purposes */
break;
case Opt_first:
ret = match_int(&tmp[0], &option);
@@ -1133,6 +1092,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
lm->lm_unmount(sdp);
}
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
/**
* fill_super - Read in superblock
* @sb: The VFS superblock
@@ -1142,7 +1112,7 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
* Returns: errno
*/
-static int fill_super(struct super_block *sb, void *data, int silent)
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
{
struct gfs2_sbd *sdp;
struct gfs2_holder mount_gh;
@@ -1153,16 +1123,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
return -ENOMEM;
}
-
- sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
- sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
- sdp->sd_args.ar_commit = 60;
-
- error = gfs2_mount_args(sdp, &sdp->sd_args, data);
- if (error) {
- printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
- goto fail;
- }
+ sdp->sd_args = *args;
if (sdp->sd_args.ar_spectator) {
sb->s_flags |= MS_RDONLY;
@@ -1170,10 +1131,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
}
if (sdp->sd_args.ar_posix_acl)
sb->s_flags |= MS_POSIXACL;
+ if (sdp->sd_args.ar_nobarrier)
+ set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_export_op = &gfs2_export_ops;
+ sb->s_xattr = gfs2_xattr_handlers;
+ sb->s_qcop = &gfs2_quotactl_ops;
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
sb->s_time_gran = 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1186,6 +1152,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+ sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+ if (sdp->sd_args.ar_statfs_quantum) {
+ sdp->sd_tune.gt_statfs_slow = 0;
+ sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+ }
+ else {
+ sdp->sd_tune.gt_statfs_slow = 1;
+ sdp->sd_tune.gt_statfs_quantum = 30;
+ }
error = init_names(sdp, silent);
if (error)
@@ -1236,7 +1211,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
}
gfs2_glock_dq_uninit(&mount_gh);
-
+ gfs2_online_uevent(sdp);
return 0;
fail_threads:
@@ -1269,18 +1244,127 @@ fail:
return error;
}
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data, struct vfsmount *mnt)
+static int set_gfs2_super(struct super_block *s, void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+ s->s_bdev = data;
+ s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+ return 0;
}
-static int test_meta_super(struct super_block *s, void *ptr)
+static int test_gfs2_super(struct super_block *s, void *ptr)
{
struct block_device *bdev = ptr;
return (bdev == s->s_bdev);
}
+/**
+ * gfs2_get_sb - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ * @mnt: The vfsmnt for this mount
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ * of whether this is the initial, or subsequent, mount of this sb
+ *
+ * Returns: 0 or -ve on error
+ */
+
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ struct block_device *bdev;
+ struct super_block *s;
+ fmode_t mode = FMODE_READ;
+ int error;
+ struct gfs2_args args;
+ struct gfs2_sbd *sdp;
+
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = -EBUSY;
+ goto error_bdev;
+ }
+ s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = PTR_ERR(s);
+ if (IS_ERR(s))
+ goto error_bdev;
+
+ memset(&args, 0, sizeof(args));
+ args.ar_quota = GFS2_QUOTA_DEFAULT;
+ args.ar_data = GFS2_DATA_DEFAULT;
+ args.ar_commit = 60;
+ args.ar_statfs_quantum = 30;
+ args.ar_quota_quantum = 60;
+ args.ar_errors = GFS2_ERRORS_DEFAULT;
+
+ error = gfs2_mount_args(&args, data);
+ if (error) {
+ printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+ if (s->s_root)
+ goto error_super;
+ deactivate_locked_super(s);
+ return error;
+ }
+
+ if (s->s_root) {
+ error = -EBUSY;
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ goto error_super;
+ close_bdev_exclusive(bdev, mode);
+ } else {
+ char b[BDEVNAME_SIZE];
+
+ s->s_flags = flags;
+ s->s_mode = mode;
+ strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ sb_set_blocksize(s, block_size(bdev));
+ error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(s);
+ return error;
+ }
+ s->s_flags |= MS_ACTIVE;
+ bdev->bd_super = s;
+ }
+
+ sdp = s->s_fs_info;
+ mnt->mnt_sb = s;
+ if (args.ar_meta)
+ mnt->mnt_root = dget(sdp->sd_master_dir);
+ else
+ mnt->mnt_root = dget(sdp->sd_root_dir);
+ return 0;
+
+error_super:
+ deactivate_locked_super(s);
+error_bdev:
+ close_bdev_exclusive(bdev, mode);
+ return error;
+}
+
static int set_meta_super(struct super_block *s, void *ptr)
{
return -EINVAL;
@@ -1300,13 +1384,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
dev_name, error);
return error;
}
- s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+ s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
path.dentry->d_inode->i_sb->s_bdev);
path_put(&path);
if (IS_ERR(s)) {
printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
return PTR_ERR(s);
}
+ if ((flags ^ s->s_flags) & MS_RDONLY) {
+ deactivate_locked_super(s);
+ return -EBUSY;
+ }
sdp = s->s_fs_info;
mnt->mnt_sb = s;
mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/namei.h>
-#include <linux/utsname.h>
#include <linux/mm.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -26,8 +25,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -349,7 +347,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
if (error)
- goto out_rgrp;
+ goto out_gunlock;
error = gfs2_dir_del(dip, &dentry->d_name);
if (error)
@@ -1302,60 +1300,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
const void *data, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = (char *)data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
- er.er_flags = flags;
-
- gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_set(GFS2_I(inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_setxattr(dentry, name, data, size, flags);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
void *data, size_t size)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
-
- return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_data = (size) ? buffer : NULL;
- er.er_data_len = size;
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_getxattr(dentry, name, data, size);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_removexattr(struct dentry *dentry, const char *name)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_name_len = strlen(er.er_name);
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_removexattr(dentry, name);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc9..e3bf6eab8750 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
* fuzziness in the current usage value of IDs that are being used on different
* nodes in the cluster simultaneously. So, it is possible for a user on
* multiple nodes to overrun their quota, but that overrun is controlable.
- * Since quota tags are part of transactions, there is no need to a quota check
+ * Since quota tags are part of transactions, there is no need for a quota check
* program to be run on node crashes or anything like that.
*
* There are couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
#include <linux/gfs2_ondisk.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
#include "gfs2.h"
#include "incore.h"
@@ -65,13 +67,6 @@
#define QUOTA_USER 1
#define QUOTA_GROUP 0
-struct gfs2_quota_host {
- u64 qu_limit;
- u64 qu_warn;
- s64 qu_value;
- u32 qu_ll_next;
-};
-
struct gfs2_quota_change_host {
u64 qc_change;
u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
return error;
}
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
spin_unlock(&qd_lru_lock);
- if (qd || !create) {
+ if (qd) {
if (new_qd) {
gfs2_glock_put(new_qd->qd_gl);
kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
qd_put(qd);
}
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data **qdp)
{
int error;
- error = qd_get(sdp, user, id, create, qdp);
+ error = qd_get(sdp, user, id, qdp);
if (error)
return error;
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
if (error)
goto out;
al->al_qd_num++;
qd++;
- error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
if (error)
goto out;
al->al_qd_num++;
qd++;
if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
- error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_USER, uid, qd);
if (error)
goto out;
al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
}
if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
- error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
if (error)
goto out;
al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
mutex_unlock(&sdp->sd_quota_mutex);
}
-static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
-{
- const struct gfs2_quota *str = buf;
-
- qu->qu_limit = be64_to_cpu(str->qu_limit);
- qu->qu_warn = be64_to_cpu(str->qu_warn);
- qu->qu_value = be64_to_cpu(str->qu_value);
- qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
-}
-
-static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
-{
- struct gfs2_quota *str = buf;
-
- str->qu_limit = cpu_to_be64(qu->qu_limit);
- str->qu_warn = cpu_to_be64(qu->qu_warn);
- str->qu_value = cpu_to_be64(qu->qu_value);
- str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
- memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
-}
-
/**
- * gfs2_adjust_quota
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
*
* This function was mostly borrowed from gfs2_block_truncate_page which was
* in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
*/
+
static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
- s64 change, struct gfs2_quota_data *qd)
+ s64 change, struct gfs2_quota_data *qd,
+ struct fs_disk_quota *fdq)
{
struct inode *inode = &ip->i_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long index = loc >> PAGE_CACHE_SHIFT;
unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, pos;
- struct buffer_head *bh;
+ struct buffer_head *bh, *dibh;
struct page *page;
void *kaddr;
- char *ptr;
- struct gfs2_quota_host qp;
+ struct gfs2_quota *qp;
s64 value;
int err = -EIO;
+ u64 size;
if (gfs2_is_stuffed(ip))
gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
gfs2_trans_add_bh(ip->i_gl, bh, 0);
kaddr = kmap_atomic(page, KM_USER0);
- ptr = kaddr + offset;
- gfs2_quota_in(&qp, ptr);
- qp.qu_value += change;
- value = qp.qu_value;
- gfs2_quota_out(&qp, ptr);
+ qp = kaddr + offset;
+ value = (s64)be64_to_cpu(qp->qu_value) + change;
+ qp->qu_value = cpu_to_be64(value);
+ qd->qd_qb.qb_value = qp->qu_value;
+ if (fdq) {
+ if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+ qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+ qd->qd_qb.qb_warn = qp->qu_warn;
+ }
+ if (fdq->d_fieldmask & FS_DQ_BHARD) {
+ qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+ qd->qd_qb.qb_limit = qp->qu_limit;
+ }
+ }
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
- err = 0;
- qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
- qd->qd_qb.qb_value = cpu_to_be64(value);
- ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
- ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
+
+ err = gfs2_meta_inode_buffer(ip, &dibh);
+ if (err)
+ goto unlock;
+
+ size = loc + sizeof(struct gfs2_quota);
+ if (size > inode->i_size) {
+ ip->i_disksize = size;
+ i_size_write(inode, size);
+ }
+ inode->i_mtime = inode->i_atime = CURRENT_TIME;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ mark_inode_dirty(inode);
+
unlock:
unlock_page(page);
page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
return -ENOMEM;
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+ mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
for (qx = 0; qx < num_qd; qx++) {
- error = gfs2_glock_nq_init(qda[qx]->qd_gl,
- LM_ST_EXCLUSIVE,
+ error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
if (error)
goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
for (x = 0; x < num_qd; x++) {
qd = qda[x];
offset = qd2offset(qd);
- error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
- (struct gfs2_quota_data *)
- qd);
+ error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
if (error)
goto out_end_trans;
@@ -817,21 +818,44 @@ out_gunlock:
out:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
+ mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
return error;
}
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_quota q;
+ struct gfs2_quota_lvb *qlvb;
+ loff_t pos;
+ int error;
+
+ memset(&q, 0, sizeof(struct gfs2_quota));
+ pos = qd2offset(qd);
+ error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
+ if (error < 0)
+ return error;
+
+ qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+ qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+ qlvb->__pad = 0;
+ qlvb->qb_limit = q.qu_limit;
+ qlvb->qb_warn = q.qu_warn;
+ qlvb->qb_value = q.qu_value;
+ qd->qd_qb = *qlvb;
+
+ return 0;
+}
+
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
- struct gfs2_quota_host q;
- char buf[sizeof(struct gfs2_quota)];
int error;
- struct gfs2_quota_lvb *qlvb;
restart:
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
- loff_t pos;
gfs2_glock_dq_uninit(q_gh);
- error = gfs2_glock_nq_init(qd->qd_gl,
- LM_ST_EXCLUSIVE, GL_NOCACHE,
- q_gh);
+ error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, q_gh);
if (error)
return error;
@@ -853,29 +875,14 @@ restart:
if (error)
goto fail;
- memset(buf, 0, sizeof(struct gfs2_quota));
- pos = qd2offset(qd);
- error = gfs2_internal_read(ip, NULL, buf, &pos,
- sizeof(struct gfs2_quota));
- if (error < 0)
+ error = update_qd(sdp, qd);
+ if (error)
goto fail_gunlock;
gfs2_glock_dq_uninit(&i_gh);
-
- gfs2_quota_in(&q, buf);
- qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
- qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
- qlvb->__pad = 0;
- qlvb->qb_limit = cpu_to_be64(q.qu_limit);
- qlvb->qb_warn = cpu_to_be64(q.qu_warn);
- qlvb->qb_value = cpu_to_be64(q.qu_value);
- qd->qd_qb = *qlvb;
-
- if (gfs2_glock_is_blocking(qd->qd_gl)) {
- gfs2_glock_dq_uninit(q_gh);
- force_refresh = 0;
- goto restart;
- }
+ gfs2_glock_dq_uninit(q_gh);
+ force_refresh = 0;
+ goto restart;
}
return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
- printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+ printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
sdp->sd_fsname, type,
(test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
print_message(qd, "exceeded");
+ quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id,
+ sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
+
error = -EDQUOT;
break;
} else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
time_after_eq(jiffies, qd->qd_last_warn +
gfs2_tune_get(sdp,
gt_quota_warn_period) * HZ)) {
+ quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id,
+ sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
error = print_message(qd, "warning");
qd->qd_last_warn = jiffies;
}
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
}
}
-int gfs2_quota_sync(struct gfs2_sbd *sdp)
+int gfs2_quota_sync(struct super_block *sb, int type)
{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
unsigned int num_qd;
@@ -1118,7 +1133,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
struct gfs2_holder q_gh;
int error;
- error = qd_get(sdp, user, id, CREATE, &qd);
+ error = qd_get(sdp, user, id, &qd);
if (error)
return error;
@@ -1127,7 +1142,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
gfs2_glock_dq_uninit(&q_gh);
qd_put(qd);
-
return error;
}
@@ -1298,12 +1312,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
}
static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
- int (*fxn)(struct gfs2_sbd *sdp),
+ int (*fxn)(struct super_block *sb, int type),
unsigned long t, unsigned long *timeo,
unsigned int *new_timeo)
{
if (t >= *timeo) {
- int error = fxn(sdp);
+ int error = fxn(sdp->sd_vfs, 0);
quotad_error(sdp, msg, error);
*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
} else {
@@ -1330,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
}
}
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+ if (!sdp->sd_statfs_force_sync) {
+ sdp->sd_statfs_force_sync = 1;
+ wake_up(&sdp->sd_quota_wait);
+ }
+}
+
+
/**
* gfs2_quotad - Write cached quota changes into the quota file
* @sdp: Pointer to GFS2 superblock
@@ -1349,8 +1371,15 @@ int gfs2_quotad(void *data)
while (!kthread_should_stop()) {
/* Update the master statfs file */
- quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
- &statfs_timeo, &tune->gt_statfs_quantum);
+ if (sdp->sd_statfs_force_sync) {
+ int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+ quotad_error(sdp, "statfs", error);
+ statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+ }
+ else
+ quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+ &statfs_timeo,
+ &tune->gt_statfs_quantum);
/* Update quota file */
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
@@ -1367,7 +1396,7 @@ int gfs2_quotad(void *data)
spin_lock(&sdp->sd_trunc_lock);
empty = list_empty(&sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
- if (empty)
+ if (empty && !sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
@@ -1377,3 +1406,181 @@ int gfs2_quotad(void *data)
return 0;
}
+static int gfs2_quota_get_xstate(struct super_block *sb,
+ struct fs_quota_stat *fqs)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ memset(fqs, 0, sizeof(struct fs_quota_stat));
+ fqs->qs_version = FS_QSTAT_VERSION;
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
+ fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+ else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
+ fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+ if (sdp->sd_quota_inode) {
+ fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+ fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+ }
+ fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+ fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
+ fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+ return 0;
+}
+
+static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_quota_lvb *qlvb;
+ struct gfs2_quota_data *qd;
+ struct gfs2_holder q_gh;
+ int error;
+
+ memset(fdq, 0, sizeof(struct fs_disk_quota));
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return -ESRCH; /* Crazy XFS error code */
+
+ if (type == USRQUOTA)
+ type = QUOTA_USER;
+ else if (type == GRPQUOTA)
+ type = QUOTA_GROUP;
+ else
+ return -EINVAL;
+
+ error = qd_get(sdp, type, id, &qd);
+ if (error)
+ return error;
+ error = do_glock(qd, FORCE, &q_gh);
+ if (error)
+ goto out;
+
+ qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+ fdq->d_version = FS_DQUOT_VERSION;
+ fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+ fdq->d_id = id;
+ fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
+ fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
+ fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+
+ gfs2_glock_dq_uninit(&q_gh);
+out:
+ qd_put(qd);
+ return error;
+}
+
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+
+static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_quota_data *qd;
+ struct gfs2_holder q_gh, i_gh;
+ unsigned int data_blocks, ind_blocks;
+ unsigned int blocks = 0;
+ int alloc_required;
+ struct gfs2_alloc *al;
+ loff_t offset;
+ int error;
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return -ESRCH; /* Crazy XFS error code */
+
+ switch(type) {
+ case USRQUOTA:
+ type = QUOTA_USER;
+ if (fdq->d_flags != XFS_USER_QUOTA)
+ return -EINVAL;
+ break;
+ case GRPQUOTA:
+ type = QUOTA_GROUP;
+ if (fdq->d_flags != XFS_GROUP_QUOTA)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+ return -EINVAL;
+ if (fdq->d_id != id)
+ return -EINVAL;
+
+ error = qd_get(sdp, type, id, &qd);
+ if (error)
+ return error;
+
+ mutex_lock(&ip->i_inode.i_mutex);
+ error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+ if (error)
+ goto out_put;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ goto out_q;
+
+ /* Check for existing entry, if none then alloc new blocks */
+ error = update_qd(sdp, qd);
+ if (error)
+ goto out_i;
+
+ /* If nothing has changed, this is a no-op */
+ if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+ (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+ fdq->d_fieldmask ^= FS_DQ_BSOFT;
+ if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+ (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+ fdq->d_fieldmask ^= FS_DQ_BHARD;
+ if (fdq->d_fieldmask == 0)
+ goto out_i;
+
+ offset = qd2offset(qd);
+ error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+ &alloc_required);
+ if (error)
+ goto out_i;
+ if (alloc_required) {
+ al = gfs2_alloc_get(ip);
+ if (al == NULL)
+ goto out_i;
+ gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+ &data_blocks, &ind_blocks);
+ blocks = al->al_requested = 1 + data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_alloc;
+ }
+
+ error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+ if (error)
+ goto out_release;
+
+ /* Apply changes */
+ error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+
+ gfs2_trans_end(sdp);
+out_release:
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ }
+out_i:
+ gfs2_glock_dq_uninit(&i_gh);
+out_q:
+ gfs2_glock_dq_uninit(&q_gh);
+out_put:
+ mutex_unlock(&ip->i_inode.i_mutex);
+ qd_put(qd);
+ return error;
+}
+
+const struct quotactl_ops gfs2_quotactl_ops = {
+ .quota_sync = gfs2_quota_sync,
+ .get_xstate = gfs2_quota_get_xstate,
+ .get_xquota = gfs2_xquota_get,
+ .set_xquota = gfs2_xquota_set,
+};
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e8..e271fa07ad02 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct super_block *sb, int type);
extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
extern int gfs2_quotad(void *data);
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
}
extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d3..e594d9e544f3 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -409,7 +409,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
memset(lh, 0, sizeof(struct gfs2_log_header));
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.__pad0 = cpu_to_be64(0);
lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
lh->lh_blkno = cpu_to_be32(lblock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..0608f490c295 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
* always aligned to a 64 bit boundary.
*
* The size of the buffer is in bytes, but is it assumed that it is
- * always ok to to read a complete multiple of 64 bits at the end
+ * always ok to read a complete multiple of 64 bits at the end
* of the block in case the end is no aligned to a natural boundary.
*
* Return: the block number (bitmap buffer scope) that was found
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
goto start_new_extent;
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
- nr_sects, GFP_NOFS);
+ nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
}
}
if (nr_sects) {
- rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+ rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
}
@@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
* Returns: The block type (GFS2_BLKST_*)
*/
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
{
struct gfs2_bitmap *bi = NULL;
u32 length, rgrp_block, buf_block;
@@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
return 0;
}
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+ (unsigned long long)rgd->rd_addr);
+ fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+ gfs2_rgrp_dump(NULL, rgd->rd_gl);
+ rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
/**
* gfs2_alloc_block - Allocate one or more blocks
* @ip: the inode to allocate the block for
@@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
return 0;
rgrp_error:
- fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
- (unsigned long long)rgd->rd_addr);
- fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
- gfs2_rgrp_dump(NULL, rgd->rd_gl);
- rgd->rd_flags |= GFS2_RDF_ERROR;
+ gfs2_rgrp_error(rgd);
return -EIO;
}
/**
* gfs2_alloc_di - Allocate a dinode
* @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
*
- * Returns: the block allocated
+ * Returns: 0 on success or error
*/
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
blk = rgblk_search(rgd, rgd->rd_last_alloc,
GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
- BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc = blk;
+ /* Since all blocks are reserved in advance, this shouldn't happen */
+ if (blk == BFITNOENT)
+ goto rgrp_error;
+ rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
+ if (rgd->rd_free == 0)
+ goto rgrp_error;
- gfs2_assert_withdraw(sdp, rgd->rd_free);
rgd->rd_free--;
rgd->rd_dinodes++;
*generation = rgd->rd_igeneration++;
+ if (*generation == 0)
+ *generation = rgd->rd_igeneration++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
rgd->rd_free_clone--;
spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
- return block;
+ *bn = block;
+ return 0;
+
+rgrp_error:
+ gfs2_rgrp_error(rgd);
+ return -EIO;
}
/**
@@ -1676,6 +1696,52 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
}
/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ * -ESTALE if it doesn't match
+ * or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder ri_gh, rgd_gh;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+ int ri_locked = 0;
+ int error;
+
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto fail;
+ ri_locked = 1;
+ }
+
+ error = -EINVAL;
+ rgd = gfs2_blk2rgrpd(sdp, no_addr);
+ if (!rgd)
+ goto fail_rindex;
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+ if (error)
+ goto fail_rindex;
+
+ if (gfs2_get_block_type(rgd, no_addr) != type)
+ error = -ESTALE;
+
+ gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+ if (ri_locked)
+ gfs2_glock_dq_uninit(&ri_gh);
+fail:
+ return error;
+}
+
+/**
* gfs2_rlist_add - add a RG to a list of RGs
* @sdp: the filesystem
* @rlist: the list of resource groups
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..c282ad41f3d1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
#include "trans.h"
#include "util.h"
#include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
@@ -68,6 +68,13 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_commit,
+ Opt_err_withdraw,
+ Opt_err_panic,
+ Opt_statfs_quantum,
+ Opt_statfs_percent,
+ Opt_quota_quantum,
+ Opt_barrier,
+ Opt_nobarrier,
Opt_error,
};
@@ -97,18 +104,25 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_commit, "commit=%d"},
+ {Opt_err_withdraw, "errors=withdraw"},
+ {Opt_err_panic, "errors=panic"},
+ {Opt_statfs_quantum, "statfs_quantum=%d"},
+ {Opt_statfs_percent, "statfs_percent=%d"},
+ {Opt_quota_quantum, "quota_quantum=%d"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_error, NULL}
};
/**
* gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
+ * @args: The structure into which the parsed options will be written
+ * @options: The options to parse
*
* Return: errno
*/
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+int gfs2_mount_args(struct gfs2_args *args, char *options)
{
char *o;
int token;
@@ -152,6 +166,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
args->ar_localcaching = 1;
break;
case Opt_debug:
+ if (args->ar_errors == GFS2_ERRORS_PANIC) {
+ printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
args->ar_debug = 1;
break;
case Opt_nodebug:
@@ -201,13 +220,52 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
case Opt_commit:
rv = match_int(&tmp[0], &args->ar_commit);
if (rv || args->ar_commit <= 0) {
- fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+ printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
return rv ? rv : -EINVAL;
}
break;
+ case Opt_statfs_quantum:
+ rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+ if (rv || args->ar_statfs_quantum < 0) {
+ printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_quota_quantum:
+ rv = match_int(&tmp[0], &args->ar_quota_quantum);
+ if (rv || args->ar_quota_quantum <= 0) {
+ printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_statfs_percent:
+ rv = match_int(&tmp[0], &args->ar_statfs_percent);
+ if (rv || args->ar_statfs_percent < 0 ||
+ args->ar_statfs_percent > 100) {
+ printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_err_withdraw:
+ args->ar_errors = GFS2_ERRORS_WITHDRAW;
+ break;
+ case Opt_err_panic:
+ if (args->ar_debug) {
+ printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
+ args->ar_errors = GFS2_ERRORS_PANIC;
+ break;
+ case Opt_barrier:
+ args->ar_nobarrier = 0;
+ break;
+ case Opt_nobarrier:
+ args->ar_nobarrier = 1;
+ break;
case Opt_error:
default:
- fs_info(sdp, "invalid mount option: %s\n", o);
+ printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
return -EINVAL;
}
}
@@ -422,7 +480,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
{
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+ struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct buffer_head *l_bh;
+ s64 x, y;
+ int need_sync = 0;
int error;
error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -436,9 +497,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
l_sc->sc_free += free;
l_sc->sc_dinodes += dinodes;
gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+ if (sdp->sd_args.ar_statfs_percent) {
+ x = 100 * l_sc->sc_free;
+ y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+ if (x >= y || x <= -y)
+ need_sync = 1;
+ }
spin_unlock(&sdp->sd_statfs_spin);
brelse(l_bh);
+ if (need_sync)
+ gfs2_wake_up_statfs(sdp);
}
void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -464,8 +533,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
}
-int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+int gfs2_statfs_sync(struct super_block *sb, int type)
{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -501,6 +571,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
goto out_bh2;
update_statfs(sdp, m_bh, l_bh);
+ sdp->sd_statfs_force_sync = 0;
gfs2_trans_end(sdp);
@@ -692,8 +763,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
int error;
flush_workqueue(gfs2_delete_workqueue);
- gfs2_quota_sync(sdp);
- gfs2_statfs_sync(sdp);
+ gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
&t_gh);
@@ -768,7 +839,6 @@ restart:
/* Release stuff */
iput(sdp->sd_jindex);
- iput(sdp->sd_inum_inode);
iput(sdp->sd_statfs_inode);
iput(sdp->sd_rindex);
iput(sdp->sd_quota_inode);
@@ -779,10 +849,8 @@ restart:
if (!sdp->sd_args.ar_spectator) {
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
- iput(sdp->sd_ir_inode);
iput(sdp->sd_sc_inode);
iput(sdp->sd_qc_inode);
}
@@ -1044,8 +1112,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
spin_lock(&gt->gt_spin);
args.ar_commit = gt->gt_log_flush_secs;
+ args.ar_quota_quantum = gt->gt_quota_quantum;
+ if (gt->gt_statfs_slow)
+ args.ar_statfs_quantum = 0;
+ else
+ args.ar_statfs_quantum = gt->gt_statfs_quantum;
spin_unlock(&gt->gt_spin);
- error = gfs2_mount_args(sdp, &args, data);
+ error = gfs2_mount_args(&args, data);
if (error)
return error;
@@ -1080,10 +1153,24 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
sb->s_flags |= MS_POSIXACL;
else
sb->s_flags &= ~MS_POSIXACL;
+ if (sdp->sd_args.ar_nobarrier)
+ set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+ else
+ clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
spin_lock(&gt->gt_spin);
gt->gt_log_flush_secs = args.ar_commit;
+ gt->gt_quota_quantum = args.ar_quota_quantum;
+ if (args.ar_statfs_quantum) {
+ gt->gt_statfs_slow = 0;
+ gt->gt_statfs_quantum = args.ar_statfs_quantum;
+ }
+ else {
+ gt->gt_statfs_slow = 1;
+ gt->gt_statfs_quantum = 30;
+ }
spin_unlock(&gt->gt_spin);
+ gfs2_online_uevent(sdp);
return 0;
}
@@ -1161,7 +1248,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
struct gfs2_args *args = &sdp->sd_args;
- int lfsecs;
+ int val;
if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
seq_printf(s, ",meta");
@@ -1222,9 +1309,36 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
}
if (args->ar_discard)
seq_printf(s, ",discard");
- lfsecs = sdp->sd_tune.gt_log_flush_secs;
- if (lfsecs != 60)
- seq_printf(s, ",commit=%d", lfsecs);
+ val = sdp->sd_tune.gt_log_flush_secs;
+ if (val != 60)
+ seq_printf(s, ",commit=%d", val);
+ val = sdp->sd_tune.gt_statfs_quantum;
+ if (val != 30)
+ seq_printf(s, ",statfs_quantum=%d", val);
+ val = sdp->sd_tune.gt_quota_quantum;
+ if (val != 60)
+ seq_printf(s, ",quota_quantum=%d", val);
+ if (args->ar_statfs_percent)
+ seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
+ if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+ const char *state;
+
+ switch (args->ar_errors) {
+ case GFS2_ERRORS_WITHDRAW:
+ state = "withdraw";
+ break;
+ case GFS2_ERRORS_PANIC:
+ state = "panic";
+ break;
+ default:
+ state = "unknown";
+ break;
+ }
+ seq_printf(s, ",errors=%s", state);
+ }
+ if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+ seq_printf(s, ",nobarrier");
+
return 0;
}
@@ -1252,6 +1366,10 @@ static void gfs2_delete_inode(struct inode *inode)
goto out;
}
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
+
gfs2_glock_dq_wait(&ip->i_iopen_gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..3df60f2d84e3 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,9 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
s64 dinodes);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
const void *buf);
extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
-extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..c5dad1eb7b91 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <asm/uaccess.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
#include "gfs2.h"
#include "incore.h"
@@ -157,7 +158,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_statfs_sync(sdp);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
return len;
}
@@ -170,13 +171,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_quota_sync(sdp);
+ gfs2_quota_sync(sdp->sd_vfs, 0);
return len;
}
static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
@@ -184,13 +186,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- gfs2_quota_refresh(sdp, 1, id);
- return len;
+ error = gfs2_quota_refresh(sdp, 1, id);
+ return error ? error : len;
}
static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
@@ -198,8 +201,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- gfs2_quota_refresh(sdp, 0, id);
- return len;
+ error = gfs2_quota_refresh(sdp, 0, id);
+ return error ? error : len;
}
static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
@@ -319,12 +322,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
return ret;
}
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%u\n", ls->ls_id);
-}
-
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -389,7 +386,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
GDLM_ATTR(block, 0644, block_show, block_store);
GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
-GDLM_ATTR(id, 0444, lkid_show, NULL);
GDLM_ATTR(jid, 0444, jid_show, NULL);
GDLM_ATTR(first, 0444, lkfirst_show, NULL);
GDLM_ATTR(first_done, 0444, first_done_show, NULL);
@@ -401,7 +397,6 @@ static struct attribute *lock_module_attrs[] = {
&gdlm_attr_proto_name.attr,
&gdlm_attr_block.attr,
&gdlm_attr_withdraw.attr,
- &gdlm_attr_id.attr,
&gdlm_attr_jid.attr,
&gdlm_attr_first.attr,
&gdlm_attr_first_done.attr,
@@ -519,7 +514,14 @@ static struct attribute_group lock_module_group = {
int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
{
+ struct super_block *sb = sdp->sd_vfs;
int error;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
sdp->sd_kobj.kset = gfs2_kset;
error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +537,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
if (error)
goto fail_tune;
- kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+ error = sysfs_create_link(&sdp->sd_kobj,
+ &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+ "device");
+ if (error)
+ goto fail_lock_module;
+
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
return 0;
+fail_lock_module:
+ sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
fail_tune:
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
fail_reg:
@@ -549,12 +559,12 @@ fail:
void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
{
+ sysfs_remove_link(&sdp->sd_kobj, "device");
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
kobject_put(&sdp->sd_kobj);
}
-
static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env)
{
@@ -563,6 +573,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+ if (!sdp->sd_args.ar_spectator)
+ add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
if (gfs2_uuid_valid(uuid)) {
add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
"%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +590,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
.uevent = gfs2_uevent,
};
-
int gfs2_sys_init(void)
{
gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
const struct lm_lockops *lm = ls->ls_ops;
va_list args;
- if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+ test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return 0;
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
- fs_err(sdp, "about to withdraw this file system\n");
- BUG_ON(sdp->sd_args.ar_debug);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+ fs_err(sdp, "about to withdraw this file system\n");
+ BUG_ON(sdp->sd_args.ar_debug);
- kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+ kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
- if (lm->lm_unmount) {
- fs_err(sdp, "telling LM to unmount\n");
- lm->lm_unmount(sdp);
+ if (lm->lm_unmount) {
+ fs_err(sdp, "telling LM to unmount\n");
+ lm->lm_unmount(sdp);
+ }
+ fs_err(sdp, "withdrawn\n");
+ dump_stack();
}
- fs_err(sdp, "withdrawn\n");
- dump_stack();
+
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
return -1;
}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
gfs2_tune_get(sdp, gt_complain_secs) * HZ))
return -2;
- printk(KERN_WARNING
- "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+ printk(KERN_WARNING
+ "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
if (sdp->sd_args.ar_debug)
BUG();
else
dump_stack();
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
+
sdp->sd_last_warning = jiffies;
return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..912f5cbc4740 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -38,26 +37,32 @@
* Returns: 1 if the EA should be stuffed
*/
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
unsigned int *size)
{
- *size = GFS2_EAREQ_SIZE_STUFFED(er);
- if (*size <= sdp->sd_jbsize)
+ unsigned int jbsize = sdp->sd_jbsize;
+
+ /* Stuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+ if (*size <= jbsize)
return 1;
- *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+ /* Unstuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+ (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
return 0;
}
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
{
unsigned int size;
- if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+ if (dsize > GFS2_EA_MAX_DATA_LEN)
return -ERANGE;
- ea_calc_size(sdp, er, &size);
+ ea_calc_size(sdp, nsize, dsize, &size);
/* This can only happen with 512 byte blocks */
if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
}
struct ea_find {
- struct gfs2_ea_request *ef_er;
+ int type;
+ const char *name;
+ size_t namel;
struct gfs2_ea_location *ef_el;
};
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
void *private)
{
struct ea_find *ef = private;
- struct gfs2_ea_request *er = ef->ef_er;
if (ea->ea_type == GFS2_EATYPE_UNUSED)
return 0;
- if (ea->ea_type == er->er_type) {
- if (ea->ea_name_len == er->er_name_len &&
- !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+ if (ea->ea_type == ef->type) {
+ if (ea->ea_name_len == ef->namel &&
+ !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
struct gfs2_ea_location *el = ef->ef_el;
get_bh(bh);
el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
return 0;
}
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
- struct gfs2_ea_location *el)
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+ struct gfs2_ea_location *el)
{
struct ea_find ef;
int error;
- ef.ef_er = er;
+ ef.type = type;
+ ef.name = name;
+ ef.namel = strlen(name);
ef.ef_el = el;
memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
unsigned int ei_size;
};
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ return 5 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SYS:
+ return 7 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SECURITY:
+ return 9 + ea->ea_name_len + 1;
+ default:
+ return 0;
+ }
+}
+
static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
}
/**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_ea_request er;
struct gfs2_holder i_gh;
int error;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ if (size) {
+ er.er_data = buffer;
+ er.er_data_len = size;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
return error;
if (ip->i_eattr) {
- struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+ struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
error = ea_foreach(ip, ea_list_i, &ei);
if (!error)
@@ -490,80 +516,87 @@ out:
return error;
}
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- char *data)
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ char *data, size_t size)
{
+ int ret;
+ size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+ if (len > size)
+ return -ERANGE;
+
if (GFS2_EA_IS_STUFFED(el->el_ea)) {
- memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
- return 0;
- } else
- return ea_get_unstuffed(ip, el->el_ea, data);
+ memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+ return len;
+ }
+ ret = ea_get_unstuffed(ip, el->el_ea, data);
+ if (ret < 0)
+ return ret;
+ return len;
}
-/**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
{
struct gfs2_ea_location el;
int error;
+ int len;
+ char *data;
- if (!ip->i_eattr)
- return -ENODATA;
-
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
if (error)
return error;
if (!el.el_ea)
- return -ENODATA;
+ goto out;
+ if (!GFS2_EA_DATA_LEN(el.el_ea))
+ goto out;
- if (er->er_data_len) {
- if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
- error = -ERANGE;
- else
- error = gfs2_ea_get_copy(ip, &el, er->er_data);
- }
- if (!error)
- error = GFS2_EA_DATA_LEN(el.el_ea);
+ len = GFS2_EA_DATA_LEN(el.el_ea);
+ data = kmalloc(len, GFP_NOFS);
+ error = -ENOMEM;
+ if (data == NULL)
+ goto out;
+ error = gfs2_ea_get_copy(ip, &el, data, len);
+ if (error == 0)
+ error = len;
+ *ppdata = data;
+out:
brelse(el.el_bh);
-
return error;
}
/**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size)
{
- struct gfs2_holder i_gh;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_ea_location el;
int error;
- if (!er->er_name_len ||
- er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ if (!ip->i_eattr)
+ return -ENODATA;
+ if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
-
- error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
+ if (!el.el_ea)
+ return -ENODATA;
+ if (size)
+ error = gfs2_ea_get_copy(ip, &el, buffer, size);
+ else
+ error = GFS2_EA_DATA_LEN(el.el_ea);
+ brelse(el.el_bh);
return error;
}
@@ -713,12 +746,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) ==
- (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +789,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
* Returns: errno
*/
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+ const void *data, size_t size)
{
+ struct gfs2_ea_request er;
unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
unsigned int blks = 1;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+ er.er_type = type;
+ er.er_name = name;
+ er.er_name_len = strlen(name);
+ er.er_data = (void *)data;
+ er.er_data_len = size;
+
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+ return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
}
static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +883,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto out;
-
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +923,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
int stuffed;
int error;
- stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+ stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+ es->es_er->er_data_len, &size);
if (ea->ea_type == GFS2_EATYPE_UNUSED) {
if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1035,22 @@ out:
return error;
}
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
- struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+ const void *value, size_t size, struct gfs2_ea_location *el)
{
+ struct gfs2_ea_request er;
struct ea_set es;
unsigned int blks = 2;
int error;
+ er.er_type = type;
+ er.er_name = name;
+ er.er_data = (void *)value;
+ er.er_name_len = strlen(name);
+ er.er_data_len = size;
+
memset(&es, 0, sizeof(struct ea_set));
- es.es_er = er;
+ es.es_er = &er;
es.es_el = el;
error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1061,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
blks++;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+ return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
}
static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1076,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
GFS2_EA2NEXT(el->el_prev) == el->el_ea);
}
- return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_ea_location el;
- int error;
-
- if (!ip->i_eattr) {
- if (er->er_flags & XATTR_REPLACE)
- return -ENODATA;
- return ea_init(ip, er);
- }
-
- error = gfs2_ea_find(ip, er, &el);
- if (error)
- return error;
-
- if (el.el_ea) {
- if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
- brelse(el.el_bh);
- return -EPERM;
- }
-
- error = -EEXIST;
- if (!(er->er_flags & XATTR_CREATE)) {
- int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
- error = ea_set_i(ip, er, &el);
- if (!error && unstuffed)
- ea_set_remove_unstuffed(ip, &el);
- }
-
- brelse(el.el_bh);
- } else {
- error = -ENODATA;
- if (!(er->er_flags & XATTR_REPLACE))
- error = ea_set_i(ip, er, NULL);
- }
-
- return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_holder i_gh;
- int error;
-
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
- error = ea_check_size(GFS2_SB(&ip->i_inode), er);
- if (error)
- return error;
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
- if (error)
- return error;
-
- if (IS_IMMUTABLE(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
+ return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
}
static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1100,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
if (GFS2_EA_IS_LAST(ea))
prev->ea_flags |= GFS2_EAFLAG_LAST;
- } else
+ } else {
ea->ea_type = GFS2_EATYPE_UNUSED;
+ }
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
@@ -1147,15 +1117,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
return error;
}
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
int error;
if (!ip->i_eattr)
return -ENODATA;
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
if (!el.el_ea)
@@ -1164,8 +1148,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
if (GFS2_EA_IS_STUFFED(el.el_ea))
error = ea_remove_stuffed(ip, &el);
else
- error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
- 0);
+ error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
brelse(el.el_bh);
@@ -1173,31 +1156,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
}
/**
- * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
- * @ip: pointer to the inode of the target file
- * @er: request information
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
*
- * Returns: errno
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
*/
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags)
{
- struct gfs2_holder i_gh;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_ea_location el;
+ unsigned int namel = strlen(name);
int error;
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+ if (namel > GFS2_EA_MAX_NAME_LEN)
+ return -ERANGE;
+
+ if (value == NULL)
+ return gfs2_xattr_remove(inode, type, name);
+
+ if (ea_check_size(sdp, namel, size))
+ return -ERANGE;
+
+ if (!ip->i_eattr) {
+ if (flags & XATTR_REPLACE)
+ return -ENODATA;
+ return ea_init(ip, type, name, value, size);
+ }
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
- if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+ if (el.el_ea) {
+ if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+ brelse(el.el_bh);
+ return -EPERM;
+ }
- gfs2_glock_dq_uninit(&i_gh);
+ error = -EEXIST;
+ if (!(flags & XATTR_CREATE)) {
+ int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+ error = ea_set_i(ip, type, name, value, size, &el);
+ if (!error && unstuffed)
+ ea_set_remove_unstuffed(ip, &el);
+ }
+
+ brelse(el.el_bh);
+ return error;
+ }
+
+ error = -ENODATA;
+ if (!(flags & XATTR_REPLACE))
+ error = ea_set_i(ip, type, name, value, size, NULL);
return error;
}
@@ -1267,22 +1289,26 @@ fail:
return error;
}
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data)
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
+ struct gfs2_ea_location el;
struct buffer_head *dibh;
int error;
- if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+ if (error)
+ return error;
+
+ if (GFS2_EA_IS_STUFFED(el.el_ea)) {
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
- memcpy(GFS2_EA2DATA(el->el_ea), data,
- GFS2_EA_DATA_LEN(el->el_ea));
+ gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
+ memcpy(GFS2_EA2DATA(el.el_ea), data,
+ GFS2_EA_DATA_LEN(el.el_ea));
} else
- error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+ error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
if (error)
return error;
@@ -1503,3 +1529,46 @@ out_alloc:
return error;
}
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = gfs2_xattr_user_get,
+ .set = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = gfs2_xattr_security_get,
+ .set = gfs2_xattr_security_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+ &gfs2_xattr_user_handler,
+ &gfs2_xattr_security_handler,
+ &gfs2_xattr_system_handler,
+ NULL,
+};
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..8d6ae5813c4d 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
#define GFS2_EA_SIZE(ea) \
ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
- (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+ (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
#define GFS2_EAREQ_SIZE_STUFFED(er) \
ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
-#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
- sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
-
#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
#define GFS2_EA_BH2FIRST(bh) \
((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
-#define GFS2_ERF_MODE 0x80000000
-
struct gfs2_ea_request {
const char *er_name;
char *er_data;
unsigned int er_name_len;
unsigned int er_data_len;
unsigned int er_type; /* GFS2_EATYPE_... */
- int er_flags;
- mode_t er_mode;
};
struct gfs2_ea_location {
@@ -61,40 +53,16 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-int gfs2_ea_find(struct gfs2_inode *ip,
- struct gfs2_ea_request *er,
- struct gfs2_ea_location *el);
-int gfs2_ea_get_copy(struct gfs2_inode *ip,
- struct gfs2_ea_location *el,
- char *data);
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data);
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
- switch (ea->ea_type) {
- case GFS2_EATYPE_USR:
- return 5 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SYS:
- return 7 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SECURITY:
- return 9 + ea->ea_name_len + 1;
- default:
- return 0;
- }
-}
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9b9d6395bad3..052f214ea6f0 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
}
unlock_new_inode(tree->inode);
+ if (!HFS_I(tree->inode)->first_blocks) {
+ printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
+ goto free_inode;
+ }
+
mapping = tree->inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 7b6165f25fbe..8bbe03c3f6d5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
brelse(HFS_SB(sb)->mdb_bh);
brelse(HFS_SB(sb)->alt_mdb_bh);
- if (HFS_SB(sb)->nls_io)
- unload_nls(HFS_SB(sb)->nls_io);
- if (HFS_SB(sb)->nls_disk)
- unload_nls(HFS_SB(sb)->nls_disk);
+ unload_nls(HFS_SB(sb)->nls_io);
+ unload_nls(HFS_SB(sb)->nls_disk);
free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
kfree(HFS_SB(sb));
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c0759fe0855b..43022f3d5148 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
iput(HFSPLUS_SB(sb).alloc_file);
iput(HFSPLUS_SB(sb).hidden_dir);
brelse(HFSPLUS_SB(sb).s_vhbh);
- if (HFSPLUS_SB(sb).nls)
- unload_nls(HFSPLUS_SB(sb).nls);
+ unload_nls(HFSPLUS_SB(sb).nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
@@ -464,8 +463,7 @@ out:
cleanup:
hfsplus_put_super(sb);
- if (nls)
- unload_nls(nls);
+ unload_nls(nls);
return err;
}
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 175d08eacc86..bed78ac8f6d1 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (hfsplus_get_last_session(sb, &part_start, &part_size))
return -EINVAL;
+ if ((u64)part_start + part_size > 0x100000000ULL) {
+ pr_err("hfs: volumes larger than 2TB are not supported yet\n");
+ return -EINVAL;
+ }
while (1) {
bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
if (!bh)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..87a1258953b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,12 +31,10 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/ima.h>
+#include <linux/magic.h>
#include <asm/uaccess.h>
-/* some random number */
-#define HUGETLBFS_MAGIC 0x958458f6
-
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
@@ -44,6 +42,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
static struct backing_dev_info hugetlbfs_backing_dev_info = {
+ .name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
@@ -381,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
{
- struct super_block *sb = inode->i_sb;
-
- if (!hlist_unhashed(&inode->i_hash)) {
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_unused);
- inodes_stat.nr_unused++;
- if (!sb || (sb->s_flags & MS_ACTIVE)) {
- spin_unlock(&inode_lock);
- return;
- }
- inode->i_state |= I_WILL_FREE;
- spin_unlock(&inode_lock);
- /*
- * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
- * in our backing_dev_info.
- */
- write_inode_now(inode, 1);
- spin_lock(&inode_lock);
- inode->i_state &= ~I_WILL_FREE;
- inodes_stat.nr_unused--;
- hlist_del_init(&inode->i_hash);
+ if (generic_detach_inode(inode)) {
+ truncate_hugepages(inode, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
}
- list_del_init(&inode->i_list);
- list_del_init(&inode->i_sb_list);
- inode->i_state |= I_FREEING;
- inodes_stat.nr_inodes--;
- spin_unlock(&inode_lock);
- truncate_hugepages(inode, 0);
- clear_inode(inode);
- destroy_inode(inode);
}
static void hugetlbfs_drop_inode(struct inode *inode)
@@ -506,6 +480,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
+ /*
+ * The policy is initialized here even if we are creating a
+ * private inode because initialization simply creates an
+ * an empty rb tree and calls spin_lock_init(), later when we
+ * call mpol_free_shared_policy() it will just return because
+ * the rb tree will still be empty.
+ */
mpol_shared_policy_init(&info->policy, NULL);
switch (mode & S_IFMT) {
default:
@@ -936,7 +917,7 @@ static int can_do_hugetlb_shm(void)
}
struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
- struct user_struct **user)
+ struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
struct file *file;
@@ -948,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
- if (!can_do_hugetlb_shm()) {
+ if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..4d8e3be55976 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
+#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
@@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly;
DEFINE_SPINLOCK(inode_lock);
/*
- * iprune_mutex provides exclusion between the kswapd or try_to_free_pages
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
* icache shrinking path, and the umount path. Without this exclusion,
* by the time prune_icache calls iput for the inode whose pages it has
* been invalidating, or by the time it calls clear_inode & destroy_inode
* from its final dispose_list, the struct super_block they refer to
* (for inode->i_sb->s_op) may already have been freed and reused.
+ *
+ * We make this an rwsem because the fastpath is icache shrinking. In
+ * some cases a filesystem may be doing a significant amount of work in
+ * its inode reclaim code, so this should improve parallelism.
*/
-static DEFINE_MUTEX(iprune_mutex);
+static DECLARE_RWSEM(iprune_sem);
/*
* Statistics gathering..
@@ -123,7 +128,7 @@ static void wake_up_inode(struct inode *inode)
int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct address_space_operations empty_aops;
- static struct inode_operations empty_iops;
+ static const struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct address_space *const mapping = &inode->i_data;
@@ -182,9 +187,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (sb->s_bdev) {
struct backing_dev_info *bdi;
- bdi = sb->s_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
inode->i_private = NULL;
@@ -383,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
/*
* We can reschedule here without worrying about the list's
* consistency because the per-sb list of inodes must not
- * change during umount anymore, and because iprune_mutex keeps
+ * change during umount anymore, and because iprune_sem keeps
* shrink_icache_memory() away.
*/
cond_resched_lock(&inode_lock);
@@ -422,7 +425,7 @@ int invalidate_inodes(struct super_block *sb)
int busy;
LIST_HEAD(throw_away);
- mutex_lock(&iprune_mutex);
+ down_write(&iprune_sem);
spin_lock(&inode_lock);
inotify_unmount_inodes(&sb->s_inodes);
fsnotify_unmount_inodes(&sb->s_inodes);
@@ -430,7 +433,7 @@ int invalidate_inodes(struct super_block *sb)
spin_unlock(&inode_lock);
dispose_list(&throw_away);
- mutex_unlock(&iprune_mutex);
+ up_write(&iprune_sem);
return busy;
}
@@ -469,7 +472,7 @@ static void prune_icache(int nr_to_scan)
int nr_scanned;
unsigned long reap = 0;
- mutex_lock(&iprune_mutex);
+ down_read(&iprune_sem);
spin_lock(&inode_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
@@ -511,7 +514,7 @@ static void prune_icache(int nr_to_scan)
spin_unlock(&inode_lock);
dispose_list(&freeable);
- mutex_unlock(&iprune_mutex);
+ up_read(&iprune_sem);
}
/*
@@ -697,13 +700,15 @@ void unlock_new_inode(struct inode *inode)
}
#endif
/*
- * This is special! We do not need the spinlock
- * when clearing I_LOCK, because we're guaranteed
- * that nobody else tries to do anything about the
- * state of the inode when it is locked, as we
- * just created it (so there can be no old holders
- * that haven't tested I_LOCK).
+ * This is special! We do not need the spinlock when clearing I_LOCK,
+ * because we're guaranteed that nobody else tries to do anything about
+ * the state of the inode when it is locked, as we just created it (so
+ * there can be no old holders that haven't tested I_LOCK).
+ * However we must emit the memory barrier so that other CPUs reliably
+ * see the clearing of I_LOCK after the other inode initialisation has
+ * completed.
*/
+ smp_mb();
WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
inode->i_state &= ~(I_LOCK|I_NEW);
wake_up_inode(inode);
@@ -1236,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
}
EXPORT_SYMBOL(generic_delete_inode);
-static void generic_forget_inode(struct inode *inode)
+/**
+ * generic_detach_inode - remove inode from inode lists
+ * @inode: inode to remove
+ *
+ * Remove inode from inode lists, write it if it's dirty. This is just an
+ * internal VFS helper exported for hugetlbfs. Do not use!
+ *
+ * Returns 1 if inode should be completely destroyed.
+ */
+int generic_detach_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -1246,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
inodes_stat.nr_unused++;
if (sb->s_flags & MS_ACTIVE) {
spin_unlock(&inode_lock);
- return;
+ return 0;
}
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_WILL_FREE;
@@ -1264,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
inode->i_state |= I_FREEING;
inodes_stat.nr_inodes--;
spin_unlock(&inode_lock);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(generic_detach_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ if (!generic_detach_inode(inode))
+ return;
if (inode->i_data.nrpages)
truncate_inode_pages(&inode->i_data, 0);
clear_inode(inode);
@@ -1394,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (mnt_want_write(mnt))
- return;
if (inode->i_flags & S_NOATIME)
- goto out;
+ return;
if (IS_NOATIME(inode))
- goto out;
+ return;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- goto out;
+ return;
if (mnt->mnt_flags & MNT_NOATIME)
- goto out;
+ return;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- goto out;
+ return;
now = current_fs_time(inode->i_sb);
if (!relatime_need_update(mnt, inode, now))
- goto out;
+ return;
if (timespec_equal(&inode->i_atime, &now))
- goto out;
+ return;
+
+ if (mnt_want_write(mnt))
+ return;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
-out:
mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
@@ -1439,34 +1461,37 @@ void file_update_time(struct file *file)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
- int sync_it = 0;
- int err;
+ enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+ /* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return;
- err = mnt_want_write_file(file);
- if (err)
- return;
-
now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_mtime, &now)) {
- inode->i_mtime = now;
- sync_it = 1;
- }
+ if (!timespec_equal(&inode->i_mtime, &now))
+ sync_it = S_MTIME;
- if (!timespec_equal(&inode->i_ctime, &now)) {
- inode->i_ctime = now;
- sync_it = 1;
- }
+ if (!timespec_equal(&inode->i_ctime, &now))
+ sync_it |= S_CTIME;
- if (IS_I_VERSION(inode)) {
- inode_inc_iversion(inode);
- sync_it = 1;
- }
+ if (IS_I_VERSION(inode))
+ sync_it |= S_VERSION;
+
+ if (!sync_it)
+ return;
- if (sync_it)
- mark_inode_dirty_sync(inode);
+ /* Finally allowed to write? Takes lock. */
+ if (mnt_want_write_file(file))
+ return;
+
+ /* Only change inode inside the lock region */
+ if (sync_it & S_VERSION)
+ inode_inc_iversion(inode);
+ if (sync_it & S_CTIME)
+ inode->i_ctime = now;
+ if (sync_it & S_MTIME)
+ inode->i_mtime = now;
+ mark_inode_dirty_sync(inode);
mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(file_update_time);
@@ -1594,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
else if (S_ISSOCK(mode))
inode->i_fop = &bad_sock_fops;
else
- printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
- mode);
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
+ " inode %s:%lu\n", mode, inode->i_sb->s_id,
+ inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
diff --git a/fs/internal.h b/fs/internal.h
index d55ef562f0bb..515175b8b72e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
* namespace.c
*/
extern int copy_mount_options(const void __user *, unsigned long *);
+extern int copy_mount_string(const void __user *, char **);
extern void free_vfsmnt(struct vfsmount *);
extern struct vfsmount *alloc_vfsmnt(const char *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5612880fcbe7..6c751106c2e5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
static int fiemap_check_ranges(struct super_block *sb,
u64 start, u64 len, u64 *new_len)
{
+ u64 maxbytes = (u64) sb->s_maxbytes;
+
*new_len = len;
if (len == 0)
return -EINVAL;
- if (start > sb->s_maxbytes)
+ if (start > maxbytes)
return -EFBIG;
/*
* Shrink request scope to what the fs can actually handle.
*/
- if ((len > sb->s_maxbytes) ||
- (sb->s_maxbytes - len) < start)
- *new_len = sb->s_maxbytes - start;
+ if (len > maxbytes || (maxbytes - len) < start)
+ *new_len = maxbytes - start;
return 0;
}
@@ -253,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode,
u64 len, get_block_t *get_block)
{
struct buffer_head tmp;
- unsigned int start_blk;
+ unsigned long long start_blk;
long long length = 0, map_len = 0;
u64 logical = 0, phys = 0, size = 0;
u32 flags = FIEMAP_EXTENT_MERGED;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index defb932eee9a..0b3fa7974fa8 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace;
static DEFINE_MUTEX(zisofs_zlib_lock);
/*
- * When decompressing, we typically obtain more than one page
- * per reference. We inject the additional pages into the page
- * cache as a form of readahead.
+ * Read data of @inode from @block_start to @block_end and uncompress
+ * to one zisofs block. Store the data in the @pages array with @pcount
+ * entries. Start storing at offset @poffset of the first page.
*/
-static int zisofs_readpage(struct file *file, struct page *page)
+static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
+ loff_t block_end, int pcount,
+ struct page **pages, unsigned poffset,
+ int *errp)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
- unsigned int maxpage, xpage, fpage, blockindex;
- unsigned long offset;
- unsigned long blockptr, blockendptr, cstart, cend, csize;
- struct buffer_head *bh, *ptrbh[2];
- unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
- unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
- unsigned long bufmask = bufsize - 1;
- int err = -EIO;
- int i;
- unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
- /* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */
- unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT;
- unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift;
- unsigned long zisofs_block_page_mask = zisofs_block_pages-1;
- struct page *pages[zisofs_block_pages];
- unsigned long index = page->index;
- int indexblocks;
-
- /* We have already been given one page, this is the one
- we must do. */
- xpage = index & zisofs_block_page_mask;
- pages[xpage] = page;
-
- /* The remaining pages need to be allocated and inserted */
- offset = index & ~zisofs_block_page_mask;
- blockindex = offset >> zisofs_block_page_shift;
- maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
- /*
- * If this page is wholly outside i_size we just return zero;
- * do_generic_file_read() will handle this for us
- */
- if (page->index >= maxpage) {
- SetPageUptodate(page);
- unlock_page(page);
+ unsigned int bufsize = ISOFS_BUFFER_SIZE(inode);
+ unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
+ unsigned int bufmask = bufsize - 1;
+ int i, block_size = block_end - block_start;
+ z_stream stream = { .total_out = 0,
+ .avail_in = 0,
+ .avail_out = 0, };
+ int zerr;
+ int needblocks = (block_size + (block_start & bufmask) + bufmask)
+ >> bufshift;
+ int haveblocks;
+ blkcnt_t blocknum;
+ struct buffer_head *bhs[needblocks + 1];
+ int curbh, curpage;
+
+ if (block_size > deflateBound(1UL << zisofs_block_shift)) {
+ *errp = -EIO;
return 0;
}
-
- maxpage = min(zisofs_block_pages, maxpage-offset);
-
- for ( i = 0 ; i < maxpage ; i++, offset++ ) {
- if ( i != xpage ) {
- pages[i] = grab_cache_page_nowait(mapping, offset);
- }
- page = pages[i];
- if ( page ) {
- ClearPageError(page);
- kmap(page);
+ /* Empty block? */
+ if (block_size == 0) {
+ for ( i = 0 ; i < pcount ; i++ ) {
+ if (!pages[i])
+ continue;
+ memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+ flush_dcache_page(pages[i]);
+ SetPageUptodate(pages[i]);
}
+ return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
}
- /* This is the last page filled, plus one; used in case of abort. */
- fpage = 0;
+ /* Because zlib is not thread-safe, do all the I/O at the top. */
+ blocknum = block_start >> bufshift;
+ memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+ haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
+ ll_rw_block(READ, haveblocks, bhs);
- /* Find the pointer to this specific chunk */
- /* Note: we're not using isonum_731() here because the data is known aligned */
- /* Note: header_size is in 32-bit words (4 bytes) */
- blockptr = (header_size + blockindex) << 2;
- blockendptr = blockptr + 4;
+ curbh = 0;
+ curpage = 0;
+ /*
+ * First block is special since it may be fractional. We also wait for
+ * it before grabbing the zlib mutex; odds are that the subsequent
+ * blocks are going to come in in short order so we don't hold the zlib
+ * mutex longer than necessary.
+ */
- indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1;
- ptrbh[0] = ptrbh[1] = NULL;
+ if (!bhs[0])
+ goto b_eio;
- if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) {
- if ( ptrbh[0] ) brelse(ptrbh[0]);
- printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n",
- inode->i_ino, blockptr >> bufshift);
- goto eio;
- }
- ll_rw_block(READ, indexblocks, ptrbh);
-
- bh = ptrbh[0];
- if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
- printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
- inode->i_ino, blockptr >> bufshift);
- if ( ptrbh[1] )
- brelse(ptrbh[1]);
- goto eio;
- }
- cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask)));
-
- if ( indexblocks == 2 ) {
- /* We just crossed a block boundary. Switch to the next block */
- brelse(bh);
- bh = ptrbh[1];
- if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
- printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
- inode->i_ino, blockendptr >> bufshift);
- goto eio;
- }
+ wait_on_buffer(bhs[0]);
+ if (!buffer_uptodate(bhs[0])) {
+ *errp = -EIO;
+ goto b_eio;
}
- cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask)));
- brelse(bh);
- if (cstart > cend)
- goto eio;
+ stream.workspace = zisofs_zlib_workspace;
+ mutex_lock(&zisofs_zlib_lock);
- csize = cend-cstart;
-
- if (csize > deflateBound(1UL << zisofs_block_shift))
- goto eio;
-
- /* Now page[] contains an array of pages, any of which can be NULL,
- and the locks on which we hold. We should now read the data and
- release the pages. If the pages are NULL the decompressed data
- for that particular page should be discarded. */
-
- if ( csize == 0 ) {
- /* This data block is empty. */
-
- for ( fpage = 0 ; fpage < maxpage ; fpage++ ) {
- if ( (page = pages[fpage]) != NULL ) {
- memset(page_address(page), 0, PAGE_CACHE_SIZE);
-
- flush_dcache_page(page);
- SetPageUptodate(page);
- kunmap(page);
- unlock_page(page);
- if ( fpage == xpage )
- err = 0; /* The critical page */
- else
- page_cache_release(page);
+ zerr = zlib_inflateInit(&stream);
+ if (zerr != Z_OK) {
+ if (zerr == Z_MEM_ERROR)
+ *errp = -ENOMEM;
+ else
+ *errp = -EIO;
+ printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
+ zerr);
+ goto z_eio;
+ }
+
+ while (curpage < pcount && curbh < haveblocks &&
+ zerr != Z_STREAM_END) {
+ if (!stream.avail_out) {
+ if (pages[curpage]) {
+ stream.next_out = page_address(pages[curpage])
+ + poffset;
+ stream.avail_out = PAGE_CACHE_SIZE - poffset;
+ poffset = 0;
+ } else {
+ stream.next_out = (void *)&zisofs_sink_page;
+ stream.avail_out = PAGE_CACHE_SIZE;
}
}
- } else {
- /* This data block is compressed. */
- z_stream stream;
- int bail = 0, left_out = -1;
- int zerr;
- int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift;
- int haveblocks;
- struct buffer_head *bhs[needblocks+1];
- struct buffer_head **bhptr;
-
- /* Because zlib is not thread-safe, do all the I/O at the top. */
-
- blockptr = cstart >> bufshift;
- memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *));
- haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks);
- ll_rw_block(READ, haveblocks, bhs);
-
- bhptr = &bhs[0];
- bh = *bhptr++;
-
- /* First block is special since it may be fractional.
- We also wait for it before grabbing the zlib
- mutex; odds are that the subsequent blocks are
- going to come in in short order so we don't hold
- the zlib mutex longer than necessary. */
-
- if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
- printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
- fpage, xpage, csize);
- goto b_eio;
- }
- stream.next_in = bh->b_data + (cstart & bufmask);
- stream.avail_in = min(bufsize-(cstart & bufmask), csize);
- csize -= stream.avail_in;
-
- stream.workspace = zisofs_zlib_workspace;
- mutex_lock(&zisofs_zlib_lock);
-
- zerr = zlib_inflateInit(&stream);
- if ( zerr != Z_OK ) {
- if ( err && zerr == Z_MEM_ERROR )
- err = -ENOMEM;
- printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
- zerr);
- goto z_eio;
+ if (!stream.avail_in) {
+ wait_on_buffer(bhs[curbh]);
+ if (!buffer_uptodate(bhs[curbh])) {
+ *errp = -EIO;
+ break;
+ }
+ stream.next_in = bhs[curbh]->b_data +
+ (block_start & bufmask);
+ stream.avail_in = min_t(unsigned, bufsize -
+ (block_start & bufmask),
+ block_size);
+ block_size -= stream.avail_in;
+ block_start = 0;
}
- while ( !bail && fpage < maxpage ) {
- page = pages[fpage];
- if ( page )
- stream.next_out = page_address(page);
- else
- stream.next_out = (void *)&zisofs_sink_page;
- stream.avail_out = PAGE_CACHE_SIZE;
-
- while ( stream.avail_out ) {
- int ao, ai;
- if ( stream.avail_in == 0 && left_out ) {
- if ( !csize ) {
- printk(KERN_WARNING "zisofs: ZF read beyond end of input\n");
- bail = 1;
- break;
- } else {
- bh = *bhptr++;
- if ( !bh ||
- (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
- /* Reached an EIO */
- printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
- fpage, xpage, csize);
-
- bail = 1;
- break;
- }
- stream.next_in = bh->b_data;
- stream.avail_in = min(csize,bufsize);
- csize -= stream.avail_in;
- }
- }
- ao = stream.avail_out; ai = stream.avail_in;
- zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
- left_out = stream.avail_out;
- if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 )
- continue;
- if ( zerr != Z_OK ) {
- /* EOF, error, or trying to read beyond end of input */
- if ( err && zerr == Z_MEM_ERROR )
- err = -ENOMEM;
- if ( zerr != Z_STREAM_END )
- printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n",
- zerr, inode->i_ino, index,
- fpage, xpage,
- stream.avail_in, stream.avail_out,
- ai, ao);
- bail = 1;
- break;
+ while (stream.avail_out && stream.avail_in) {
+ zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
+ if (zerr == Z_BUF_ERROR && stream.avail_in == 0)
+ break;
+ if (zerr == Z_STREAM_END)
+ break;
+ if (zerr != Z_OK) {
+ /* EOF, error, or trying to read beyond end of input */
+ if (zerr == Z_MEM_ERROR)
+ *errp = -ENOMEM;
+ else {
+ printk(KERN_DEBUG
+ "zisofs: zisofs_inflate returned"
+ " %d, inode = %lu,"
+ " page idx = %d, bh idx = %d,"
+ " avail_in = %d,"
+ " avail_out = %d\n",
+ zerr, inode->i_ino, curpage,
+ curbh, stream.avail_in,
+ stream.avail_out);
+ *errp = -EIO;
}
+ goto inflate_out;
}
+ }
- if ( stream.avail_out && zerr == Z_STREAM_END ) {
- /* Fractional page written before EOF. This may
- be the last page in the file. */
- memset(stream.next_out, 0, stream.avail_out);
- stream.avail_out = 0;
+ if (!stream.avail_out) {
+ /* This page completed */
+ if (pages[curpage]) {
+ flush_dcache_page(pages[curpage]);
+ SetPageUptodate(pages[curpage]);
}
+ curpage++;
+ }
+ if (!stream.avail_in)
+ curbh++;
+ }
+inflate_out:
+ zlib_inflateEnd(&stream);
- if ( !stream.avail_out ) {
- /* This page completed */
- if ( page ) {
- flush_dcache_page(page);
- SetPageUptodate(page);
- kunmap(page);
- unlock_page(page);
- if ( fpage == xpage )
- err = 0; /* The critical page */
- else
- page_cache_release(page);
- }
- fpage++;
- }
+z_eio:
+ mutex_unlock(&zisofs_zlib_lock);
+
+b_eio:
+ for (i = 0; i < haveblocks; i++)
+ brelse(bhs[i]);
+ return stream.total_out;
+}
+
+/*
+ * Uncompress data so that pages[full_page] is fully uptodate and possibly
+ * fills in other pages if we have data for them.
+ */
+static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
+ struct page **pages)
+{
+ loff_t start_off, end_off;
+ loff_t block_start, block_end;
+ unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
+ unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+ unsigned int blockptr;
+ loff_t poffset = 0;
+ blkcnt_t cstart_block, cend_block;
+ struct buffer_head *bh;
+ unsigned int blkbits = ISOFS_BUFFER_BITS(inode);
+ unsigned int blksize = 1 << blkbits;
+ int err;
+ loff_t ret;
+
+ BUG_ON(!pages[full_page]);
+
+ /*
+ * We want to read at least 'full_page' page. Because we have to
+ * uncompress the whole compression block anyway, fill the surrounding
+ * pages with the data we have anyway...
+ */
+ start_off = page_offset(pages[full_page]);
+ end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+
+ cstart_block = start_off >> zisofs_block_shift;
+ cend_block = (end_off + (1 << zisofs_block_shift) - 1)
+ >> zisofs_block_shift;
+
+ WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
+ ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+
+ /* Find the pointer to this specific chunk */
+ /* Note: we're not using isonum_731() here because the data is known aligned */
+ /* Note: header_size is in 32-bit words (4 bytes) */
+ blockptr = (header_size + cstart_block) << 2;
+ bh = isofs_bread(inode, blockptr >> blkbits);
+ if (!bh)
+ return -EIO;
+ block_start = le32_to_cpu(*(__le32 *)
+ (bh->b_data + (blockptr & (blksize - 1))));
+
+ while (cstart_block < cend_block && pcount > 0) {
+ /* Load end of the compressed block in the file */
+ blockptr += 4;
+ /* Traversed to next block? */
+ if (!(blockptr & (blksize - 1))) {
+ brelse(bh);
+
+ bh = isofs_bread(inode, blockptr >> blkbits);
+ if (!bh)
+ return -EIO;
+ }
+ block_end = le32_to_cpu(*(__le32 *)
+ (bh->b_data + (blockptr & (blksize - 1))));
+ if (block_start > block_end) {
+ brelse(bh);
+ return -EIO;
+ }
+ err = 0;
+ ret = zisofs_uncompress_block(inode, block_start, block_end,
+ pcount, pages, poffset, &err);
+ poffset += ret;
+ pages += poffset >> PAGE_CACHE_SHIFT;
+ pcount -= poffset >> PAGE_CACHE_SHIFT;
+ full_page -= poffset >> PAGE_CACHE_SHIFT;
+ poffset &= ~PAGE_CACHE_MASK;
+
+ if (err) {
+ brelse(bh);
+ /*
+ * Did we finish reading the page we really wanted
+ * to read?
+ */
+ if (full_page < 0)
+ return 0;
+ return err;
}
- zlib_inflateEnd(&stream);
- z_eio:
- mutex_unlock(&zisofs_zlib_lock);
+ block_start = block_end;
+ cstart_block++;
+ }
+
+ if (poffset && *pages) {
+ memset(page_address(*pages) + poffset, 0,
+ PAGE_CACHE_SIZE - poffset);
+ flush_dcache_page(*pages);
+ SetPageUptodate(*pages);
+ }
+ return 0;
+}
- b_eio:
- for ( i = 0 ; i < haveblocks ; i++ ) {
- if ( bhs[i] )
- brelse(bhs[i]);
+/*
+ * When decompressing, we typically obtain more than one page
+ * per reference. We inject the additional pages into the page
+ * cache as a form of readahead.
+ */
+static int zisofs_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ int err;
+ int i, pcount, full_page;
+ unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+ unsigned int zisofs_pages_per_cblock =
+ PAGE_CACHE_SHIFT <= zisofs_block_shift ?
+ (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+ struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+ pgoff_t index = page->index, end_index;
+
+ end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ /*
+ * If this page is wholly outside i_size we just return zero;
+ * do_generic_file_read() will handle this for us
+ */
+ if (index >= end_index) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+ /* We have already been given one page, this is the one
+ we must do. */
+ full_page = index & (zisofs_pages_per_cblock - 1);
+ pcount = min_t(int, zisofs_pages_per_cblock,
+ end_index - (index & ~(zisofs_pages_per_cblock - 1)));
+ index -= full_page;
+ } else {
+ full_page = 0;
+ pcount = 1;
+ }
+ pages[full_page] = page;
+
+ for (i = 0; i < pcount; i++, index++) {
+ if (i != full_page)
+ pages[i] = grab_cache_page_nowait(mapping, index);
+ if (pages[i]) {
+ ClearPageError(pages[i]);
+ kmap(pages[i]);
}
}
-eio:
+ err = zisofs_fill_pages(inode, full_page, pcount, pages);
/* Release any residual pages, do not SetPageUptodate */
- while ( fpage < maxpage ) {
- page = pages[fpage];
- if ( page ) {
- flush_dcache_page(page);
- if ( fpage == xpage )
- SetPageError(page);
- kunmap(page);
- unlock_page(page);
- if ( fpage != xpage )
- page_cache_release(page);
+ for (i = 0; i < pcount; i++) {
+ if (pages[i]) {
+ flush_dcache_page(pages[i]);
+ if (i == full_page && err)
+ SetPageError(pages[i]);
+ kunmap(pages[i]);
+ unlock_page(pages[i]);
+ if (i != full_page)
+ page_cache_release(pages[i]);
}
- fpage++;
}
/* At this point, err contains 0 or -EIO depending on the "critical" page */
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index e81a30593ba9..ed752cb38474 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -9,7 +9,7 @@
*
* The following files are helpful:
*
- * Documentation/filesystems/Exporting
+ * Documentation/filesystems/nfs/Exporting
* fs/exportfs/expfs.c.
*/
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 85f96bc651c7..6b4dcd4f2943 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
#ifdef CONFIG_JOLIET
lock_kernel();
- if (sbi->s_nls_iocharset) {
- unload_nls(sbi->s_nls_iocharset);
- sbi->s_nls_iocharset = NULL;
- }
+ unload_nls(sbi->s_nls_iocharset);
unlock_kernel();
#endif
@@ -912,8 +909,7 @@ out_no_root:
printk(KERN_WARNING "%s: get root inode failed\n", __func__);
out_no_inode:
#ifdef CONFIG_JOLIET
- if (sbi->s_nls_iocharset)
- unload_nls(sbi->s_nls_iocharset);
+ unload_nls(sbi->s_nls_iocharset);
#endif
goto out_freesbi;
out_no_read:
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c2fb2dd0131f..96a685c550fd 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -518,8 +518,7 @@ repeat:
if (algo == SIG('p', 'z')) {
int block_shift =
isonum_711(&rr->u.ZF.parms[1]);
- if (block_shift < PAGE_CACHE_SHIFT
- || block_shift > 17) {
+ if (block_shift > 17) {
printk(KERN_WARNING "isofs: "
"Can't handle ZF block "
"size of 2^%d\n",
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned int blocknr, freed;
if (is_journal_aborted(journal))
return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
+ "Cleaning journal tail from %d to %d (offset %u), "
+ "freeing %u\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..4160afad6d00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
+EXPORT_SYMBOL(log_start_commit);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
@@ -276,7 +277,7 @@ static void journal_kill_thread(journal_t *journal)
int journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
- unsigned long blocknr)
+ unsigned int blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
@@ -567,9 +568,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
* Log buffer allocation routines:
*/
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
{
- unsigned long blocknr;
+ unsigned int blocknr;
spin_lock(&journal->j_state_lock);
J_ASSERT(journal->j_free > 1);
@@ -590,11 +591,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
* this is a no-op. If needed, we can use j_blk_offset - everything is
* ready.
*/
-int journal_bmap(journal_t *journal, unsigned long blocknr,
- unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+ unsigned int *retp)
{
int err = 0;
- unsigned long ret;
+ unsigned int ret;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +605,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
char b[BDEVNAME_SIZE];
printk(KERN_ALERT "%s: journal block not found "
- "at offset %lu on %s\n",
+ "at offset %u on %s\n",
__func__,
blocknr,
bdevname(journal->j_dev, b));
@@ -630,7 +631,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
err = journal_next_log_block(journal, &blocknr);
@@ -756,6 +757,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
return journal;
out_err:
+ kfree(journal->j_wbuf);
kfree(journal);
return NULL;
}
@@ -774,7 +776,7 @@ journal_t * journal_init_inode (struct inode *inode)
journal_t *journal = journal_init_common();
int err;
int n;
- unsigned long blocknr;
+ unsigned int blocknr;
if (!journal)
return NULL;
@@ -820,6 +822,7 @@ journal_t * journal_init_inode (struct inode *inode)
return journal;
out_err:
+ kfree(journal->j_wbuf);
kfree(journal);
return NULL;
}
@@ -846,12 +849,12 @@ static void journal_fail_superblock (journal_t *journal)
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
- unsigned long first, last;
+ unsigned int first, last;
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
- printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+ printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
first, last);
journal_fail_superblock(journal);
return -EINVAL;
@@ -885,7 +888,7 @@ static int journal_reset(journal_t *journal)
**/
int journal_create(journal_t *journal)
{
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
journal_superblock_t *sb;
int i, err;
@@ -969,14 +972,14 @@ void journal_update_superblock(journal_t *journal, int wait)
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %ld, seq %d, errno %d)\n",
+ "(start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
goto out;
}
spin_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1374,7 @@ int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
+ unsigned int old_tail;
spin_lock(&journal->j_state_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned int next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ jbd_debug(3, "JBD: checking block %u\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
- unsigned long io_block;
+ unsigned int io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
- "block %ld in log\n",
+ "block %u in log\n",
err, io_block);
} else {
- unsigned long blocknr;
+ unsigned int blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
max = be32_to_cpu(header->r_count);
while (offset < max) {
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
- unsigned long blocknr;
+ unsigned int blocknr;
};
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
tid_t seq)
{
struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
- unsigned long blocknr)
+ unsigned int blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
* by one.
*/
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
}
}
- jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
*/
int journal_set_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
*/
int journal_test_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires =
+ round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
-
out:
return handle;
}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
+ struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
+ stats = &transaction->t_chp_stats;
+ if (stats->cs_chp_time)
+ stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
+ jiffies);
+ trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
+ transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..c5edc13ccdd0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ !JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
set_buffer_ordered(bh);
barrier_done = 1;
}
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
.nr_to_write = mapping->nrpages * 2,
.range_start = 0,
.range_end = i_size_read(mapping->host),
- .for_writepages = 1,
};
ret = generic_writepages(mapping, &wbc);
@@ -286,7 +286,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
if (err) {
/*
* Because AS_EIO is cleared by
- * wait_on_page_writeback_range(), set it again so
+ * filemap_fdatawait_range(), set it again so
* that user process can get -EIO from fsync().
*/
set_bit(AS_EIO,
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (commit_transaction->t_synchronous_commit)
write_op = WRITE_SYNC_PLUG;
trace_jbd2_commit_locking(journal, commit_transaction);
- stats.u.run.rs_wait = commit_transaction->t_max_wait;
- stats.u.run.rs_locked = jiffies;
- stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
- stats.u.run.rs_locked);
+ stats.run.rs_wait = commit_transaction->t_max_wait;
+ stats.run.rs_locked = jiffies;
+ stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+ stats.run.rs_locked);
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_switch_revoke_table(journal);
trace_jbd2_commit_flushing(journal, commit_transaction);
- stats.u.run.rs_flushing = jiffies;
- stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
- stats.u.run.rs_flushing);
+ stats.run.rs_flushing = jiffies;
+ stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
+ stats.run.rs_flushing);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_unlock(&journal->j_state_lock);
trace_jbd2_commit_logging(journal, commit_transaction);
- stats.u.run.rs_logging = jiffies;
- stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
- stats.u.run.rs_logging);
- stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
- stats.u.run.rs_blocks_logged = 0;
+ stats.run.rs_logging = jiffies;
+ stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
+ stats.run.rs_logging);
+ stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+ stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
submit_bh(write_op, bh);
}
cond_resched();
- stats.u.run.rs_blocks_logged += bufs;
+ stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -707,11 +707,13 @@ start_journal_io:
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
+ if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(journal->j_dev, NULL);
}
/*
@@ -834,7 +836,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -986,33 +988,30 @@ restart_loop:
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_start = jiffies;
- stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
- commit_transaction->t_start);
+ stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
+ commit_transaction->t_start);
/*
- * File the transaction for history
+ * File the transaction statistics
*/
- stats.ts_type = JBD2_STATS_RUN;
stats.ts_tid = commit_transaction->t_tid;
- stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
- spin_lock(&journal->j_history_lock);
- memcpy(journal->j_history + journal->j_history_cur, &stats,
- sizeof(stats));
- if (++journal->j_history_cur == journal->j_history_max)
- journal->j_history_cur = 0;
+ stats.run.rs_handle_count = commit_transaction->t_handle_count;
+ trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
+ commit_transaction->t_tid, &stats.run);
/*
* Calculate overall stats
*/
+ spin_lock(&journal->j_history_lock);
journal->j_stats.ts_tid++;
- journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
- journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
- journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
- journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
- journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
- journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
- journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
- journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
+ journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_running += stats.run.rs_running;
+ journal->j_stats.run.rs_locked += stats.run.rs_locked;
+ journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+ journal->j_stats.run.rs_logging += stats.run.rs_logging;
+ journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+ journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+ journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..af60d98ddd22 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
- printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
- "commit interval %ld seconds\n", current->pid,
- journal->j_devname, journal->j_commit_interval / HZ);
-
/*
* And now, wait forever for commit wakeup events.
*/
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
{
struct task_struct *t;
- t = kthread_run(kjournald2, journal, "kjournald2");
+ t = kthread_run(kjournald2, journal, "jbd2/%s",
+ journal->j_devname);
if (IS_ERR(t))
return PTR_ERR(t);
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
int max;
};
-static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
- struct transaction_stats_s *ts,
- int first)
-{
- if (ts == s->stats + s->max)
- ts = s->stats;
- if (!first && ts == s->stats + s->start)
- return NULL;
- while (ts->ts_type == 0) {
- ts++;
- if (ts == s->stats + s->max)
- ts = s->stats;
- if (ts == s->stats + s->start)
- return NULL;
- }
- return ts;
-
-}
-
-static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct jbd2_stats_proc_session *s = seq->private;
- struct transaction_stats_s *ts;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
- if (!ts)
- return NULL;
- l--;
- while (l) {
- ts = jbd2_history_skip_empty(s, ++ts, 0);
- if (!ts)
- break;
- l--;
- }
- return ts;
-}
-
-static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct jbd2_stats_proc_session *s = seq->private;
- struct transaction_stats_s *ts = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return jbd2_history_skip_empty(s, s->stats + s->start, 1);
- else
- return jbd2_history_skip_empty(s, ++ts, 0);
-}
-
-static int jbd2_seq_history_show(struct seq_file *seq, void *v)
-{
- struct transaction_stats_s *ts = v;
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
- "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
- "wait", "run", "lock", "flush", "log", "hndls",
- "block", "inlog", "ctime", "write", "drop",
- "close");
- return 0;
- }
- if (ts->ts_type == JBD2_STATS_RUN)
- seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
- "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
- jiffies_to_msecs(ts->u.run.rs_wait),
- jiffies_to_msecs(ts->u.run.rs_running),
- jiffies_to_msecs(ts->u.run.rs_locked),
- jiffies_to_msecs(ts->u.run.rs_flushing),
- jiffies_to_msecs(ts->u.run.rs_logging),
- ts->u.run.rs_handle_count,
- ts->u.run.rs_blocks,
- ts->u.run.rs_blocks_logged);
- else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
- seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
- "C", ts->ts_tid, " ",
- jiffies_to_msecs(ts->u.chp.cs_chp_time),
- ts->u.chp.cs_written, ts->u.chp.cs_dropped,
- ts->u.chp.cs_forced_to_close);
- else
- J_ASSERT(0);
- return 0;
-}
-
-static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static struct seq_operations jbd2_seq_history_ops = {
- .start = jbd2_seq_history_start,
- .next = jbd2_seq_history_next,
- .stop = jbd2_seq_history_stop,
- .show = jbd2_seq_history_show,
-};
-
-static int jbd2_seq_history_open(struct inode *inode, struct file *file)
-{
- journal_t *journal = PDE(inode)->data;
- struct jbd2_stats_proc_session *s;
- int rc, size;
-
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- size = sizeof(struct transaction_stats_s) * journal->j_history_max;
- s->stats = kmalloc(size, GFP_KERNEL);
- if (s->stats == NULL) {
- kfree(s);
- return -ENOMEM;
- }
- spin_lock(&journal->j_history_lock);
- memcpy(s->stats, journal->j_history, size);
- s->max = journal->j_history_max;
- s->start = journal->j_history_cur % s->max;
- spin_unlock(&journal->j_history_lock);
-
- rc = seq_open(file, &jbd2_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = file->private_data;
- m->private = s;
- } else {
- kfree(s->stats);
- kfree(s);
- }
- return rc;
-
-}
-
-static int jbd2_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = file->private_data;
- struct jbd2_stats_proc_session *s = seq->private;
-
- kfree(s->stats);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static struct file_operations jbd2_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = jbd2_seq_history_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = jbd2_seq_history_release,
-};
-
static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
- seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+ seq_printf(seq, "%lu transaction, each up to %u blocks\n",
s->stats->ts_tid,
s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
seq_printf(seq, " %ums running transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
- jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
seq_printf(seq, " %ums flushing data (in ordered mode)\n",
- jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
seq_printf(seq, " %ums logging transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
seq_printf(seq, " %lluus average transaction commit time\n",
div_u64(s->journal->j_average_commit_time, 1000));
seq_printf(seq, " %lu handles per transaction\n",
- s->stats->u.run.rs_handle_count / s->stats->ts_tid);
+ s->stats->run.rs_handle_count / s->stats->ts_tid);
seq_printf(seq, " %lu blocks per transaction\n",
- s->stats->u.run.rs_blocks / s->stats->ts_tid);
+ s->stats->run.rs_blocks / s->stats->ts_tid);
seq_printf(seq, " %lu logged blocks per transaction\n",
- s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
+ s->stats->run.rs_blocks_logged / s->stats->ts_tid);
return 0;
}
@@ -872,7 +722,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations jbd2_seq_info_ops = {
+static const struct seq_operations jbd2_seq_info_ops = {
.start = jbd2_seq_info_start,
.next = jbd2_seq_info_next,
.stop = jbd2_seq_info_stop,
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static struct file_operations jbd2_seq_info_fops = {
+static const struct file_operations jbd2_seq_info_fops = {
.owner = THIS_MODULE,
.open = jbd2_seq_info_open,
.read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
{
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
- proc_create_data("history", S_IRUGO, journal->j_proc_entry,
- &jbd2_seq_history_fops, journal);
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_info_fops, journal);
}
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
static void jbd2_stats_proc_exit(journal_t *journal)
{
remove_proc_entry("info", journal->j_proc_entry);
- remove_proc_entry("history", journal->j_proc_entry);
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
-static void journal_init_stats(journal_t *journal)
-{
- int size;
-
- if (!proc_jbd2_stats)
- return;
-
- journal->j_history_max = 100;
- size = sizeof(struct transaction_stats_s) * journal->j_history_max;
- journal->j_history = kzalloc(size, GFP_KERNEL);
- if (!journal->j_history) {
- journal->j_history_max = 0;
- return;
- }
- spin_lock_init(&journal->j_history_lock);
-}
-
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
goto fail;
}
- journal_init_stats(journal);
+ spin_lock_init(&journal->j_history_lock);
return journal;
fail:
@@ -1083,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
return journal;
out_err:
+ kfree(journal->j_wbuf);
jbd2_stats_proc_exit(journal);
kfree(journal);
return NULL;
@@ -1115,7 +946,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
while ((p = strchr(p, '/')))
*p = '!';
p = journal->j_devname + strlen(journal->j_devname);
- sprintf(p, ":%lu", journal->j_inode->i_ino);
+ sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
journal, inode->i_sb->s_id, inode->i_ino,
@@ -1156,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
return journal;
out_err:
+ kfree(journal->j_wbuf);
jbd2_stats_proc_exit(journal);
kfree(journal);
return NULL;
@@ -1187,6 +1019,12 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
@@ -1410,6 +1248,13 @@ int jbd2_journal_load(journal_t *journal)
if (jbd2_journal_recover(journal))
goto recovery_error;
+ if (journal->j_failed_commit) {
+ printk(KERN_ERR "JBD2: journal transaction %u on %s "
+ "is corrupt.\n", journal->j_failed_commit,
+ journal->j_devname);
+ return -EIO;
+ }
+
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
__jbd2_log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
__jbd2_log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return rc;
}
-static int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl;
int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jffs2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jffs2_check_acl);
-}
-
int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
{
struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-extern int jffs2_permission(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int);
extern int jffs2_acl_chmod(struct inode *);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
#else
-#define jffs2_permission (NULL)
+#define jffs2_check_acl (NULL)
#define jffs2_acl_chmod(inode) (0)
#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index e9580104b6ba..3ff50da94789 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
#include <linux/completion.h>
#include <linux/sched.h>
#include <linux/freezer.h>
+#include <linux/kthread.h>
#include "nodelist.h"
@@ -31,7 +32,7 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
/* This must only ever be called when no GC thread is currently running */
int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
{
- pid_t pid;
+ struct task_struct *tsk;
int ret = 0;
BUG_ON(c->gc_task);
@@ -39,15 +40,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
init_completion(&c->gc_thread_start);
init_completion(&c->gc_thread_exit);
- pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES);
- if (pid < 0) {
- printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %d\n", -pid);
+ tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
+ if (IS_ERR(tsk)) {
+ printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk));
complete(&c->gc_thread_exit);
- ret = pid;
+ ret = PTR_ERR(tsk);
} else {
/* Wait for it... */
- D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid));
+ D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid));
wait_for_completion(&c->gc_thread_start);
+ ret = tsk->pid;
}
return ret;
@@ -71,7 +73,6 @@ static int jffs2_garbage_collect_thread(void *_c)
{
struct jffs2_sb_info *c = _c;
- daemonize("jffs2_gcd_mtd%d", c->mtd->index);
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
allow_signal(SIGCONT);
@@ -107,6 +108,11 @@ static int jffs2_garbage_collect_thread(void *_c)
* the GC thread get there first. */
schedule_timeout_interruptible(msecs_to_jiffies(50));
+ if (kthread_should_stop()) {
+ D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n"));
+ goto die;
+ }
+
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 9eff2bdae8a7..c082868910f2 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -39,13 +39,13 @@ int __init jffs2_create_slab_caches(void)
raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
sizeof(struct jffs2_raw_dirent),
- 0, 0, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!raw_dirent_slab)
goto err;
raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
sizeof(struct jffs2_raw_inode),
- 0, 0, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!raw_inode_slab)
goto err;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
}
-static struct export_operations jffs2_export_ops = {
+static const struct export_operations jffs2_export_ops = {
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
.follow_link = jffs2_follow_link,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
return rc;
}
-static int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jfs_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jfs_check_acl);
-}
-
int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
{
struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-int jfs_permission(struct inode *, int);
+int jfs_check_acl(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
int jfs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 37e6dcda8fc8..2234c73fc577 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
rc = jfs_umount(sb);
if (rc)
jfs_err("jfs_umount failed with return code %d", rc);
- if (sbi->nls_tab)
- unload_nls(sbi->nls_tab);
- sbi->nls_tab = NULL;
+
+ unload_nls(sbi->nls_tab);
truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
iput(sbi->direct_inode);
- sbi->direct_inode = NULL;
kfree(sbi);
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
if (nls_map != (void *) -1) {
/* Discard old (if remount) */
- if (sbi->nls_tab)
- unload_nls(sbi->nls_tab);
+ unload_nls(sbi->nls_tab);
sbi->nls_tab = nls_map;
}
return 1;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcec3d3ea64f..219576c52d80 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
loff_t pos = *ppos;
+ size_t ret;
+
if (pos < 0)
return -EINVAL;
- if (pos >= available)
+ if (pos >= available || !count)
return 0;
if (count > available - pos)
count = available - pos;
- if (copy_to_user(to, from + pos, count))
+ ret = copy_to_user(to, from + pos, count);
+ if (ret == count)
return -EFAULT;
+ count -= ret;
*ppos = pos + count;
return count;
}
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
if (copy_from_user(attr->set_buf, buf, size))
goto out;
- ret = len; /* claim we got the whole input */
attr->set_buf[size] = '\0';
val = simple_strtol(attr->set_buf, NULL, 0);
- attr->set(attr->data, val);
+ ret = attr->set(attr->data, val);
+ if (ret == 0)
+ ret = len; /* on success, claim we got the whole input */
out:
mutex_unlock(&attr->mutex);
return ret;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+ if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
}
-static struct file_lock_operations nlmclnt_lock_ops = {
+static const struct file_lock_operations nlmclnt_lock_ops = {
.fl_copy_lock = nlmclnt_locks_copy_lock,
.fl_release_private = nlmclnt_locks_release_private,
};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
return hash & (NLM_HOST_NRHASH - 1);
}
-static void nlm_clear_port(struct sockaddr *sap)
-{
- switch (sap->sa_family) {
- case AF_INET:
- ((struct sockaddr_in *)sap)->sin_port = 0;
- break;
- case AF_INET6:
- ((struct sockaddr_in6 *)sap)->sin6_port = 0;
- break;
- }
-}
-
/*
* Common host lookup routine for server & client
*/
@@ -123,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
*/
chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
- if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
+ if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
@@ -137,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
if (host->h_server != ni->server)
continue;
if (ni->server &&
- !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+ !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_addrbuf = nsm->sm_addrbuf;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
- nlm_clear_port(nlm_addr(host));
+ rpc_set_port(nlm_addr(host), 0);
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
host->h_version = ni->version;
host->h_proto = ni->protocol;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
return (struct sockaddr *)&nsm->sm_addr;
}
-static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
- if (ipv6_addr_v4mapped(&sin6->sin6_addr))
- snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
- else if (sin6->sin6_scope_id != 0)
- snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
- sin6->sin6_scope_id);
- else
- snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nsm_display_address(const struct sockaddr *sap,
- char *buf, const size_t len)
-{
- switch (sap->sa_family) {
- case AF_INET:
- nsm_display_ipv4_address(sap, buf, len);
- break;
- case AF_INET6:
- nsm_display_ipv6_address(sap, buf, len);
- break;
- default:
- snprintf(buf, len, "unsupported address family");
- break;
- }
-}
-
static struct rpc_clnt *nsm_create(void)
{
struct sockaddr_in sin = {
@@ -246,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
struct nsm_handle *nsm;
list_for_each_entry(nsm, &nsm_handles, sm_link)
- if (nlm_cmp_addr(nsm_addr(nsm), sap))
+ if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
memcpy(nsm_addr(new), sap, salen);
new->sm_addrlen = salen;
nsm_init_private(new);
- nsm_display_address((const struct sockaddr *)&new->sm_addr,
- new->sm_addrbuf, sizeof(new->sm_addrbuf));
+
+ if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+ sizeof(new->sm_addrbuf)) == 0)
+ (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+ "unsupported address family");
memcpy(new->sm_name, hostname, hostname_len);
new->sm_name[hostname_len] = '\0';
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
}
-struct lock_manager_operations nlmsvc_lock_operations = {
+const struct lock_manager_operations nlmsvc_lock_operations = {
.fl_compare_owner = nlmsvc_same_owner,
.fl_notify = nlmsvc_notify_blocked,
.fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
- return nlm_cmp_addr(nlm_srcaddr(host), datap);
+ return rpc_cmp_addr(nlm_srcaddr(host), datap);
}
/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
#include <linux/types.h>
#include <linux/sched.h>
-#include <linux/utsname.h>
#include <linux/nfs.h>
#include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/sched.h>
-#include <linux/utsname.h>
#include <linux/nfs.h>
#include <linux/sunrpc/xdr.h>
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
return fl->fl_file == try->fl_file;
}
-static struct lock_manager_operations lease_manager_ops = {
+static const struct lock_manager_operations lease_manager_ops = {
.fl_break = lease_break_callback,
.fl_release_private = lease_release_private_callback,
.fl_mylease = lease_mylease_callback,
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file.
*/
if (found)
- cond_resched_bkl();
+ cond_resched();
find_conflict:
for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (can_sleep)
lock->fl_flags |= FL_SLEEP;
- error = security_file_lock(filp, cmd);
+ error = security_file_lock(filp, lock->fl_type);
if (error)
goto out_free;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d407e7a0b6fe..6198731d7fcd 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -308,14 +308,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
struct inode *inode = (struct inode*)mapping->host;
char *kaddr = page_address(page);
loff_t pos = page_offset(page) + (char*)de - kaddr;
- unsigned len = minix_sb(inode->i_sb)->s_dirsize;
+ struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+ unsigned len = sbi->s_dirsize;
int err;
lock_page(page);
err = __minix_write_begin(NULL, mapping, pos, len,
AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
if (err == 0) {
- de->inode = 0;
+ if (sbi->s_version == MINIX_V3)
+ ((minix3_dirent *) de)->inode = 0;
+ else
+ de->inode = 0;
err = dir_commit_chunk(page, pos, len);
} else {
unlock_page(page);
@@ -440,7 +444,10 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
if (err == 0) {
- de->inode = inode->i_ino;
+ if (sbi->s_version == MINIX_V3)
+ ((minix3_dirent *) de)->inode = inode->i_ino;
+ else
+ de->inode = inode->i_ino;
err = dir_commit_chunk(page, pos, sbi->s_dirsize);
} else {
unlock_page(page);
@@ -470,7 +477,14 @@ ino_t minix_inode_by_name(struct dentry *dentry)
ino_t res = 0;
if (de) {
- res = de->inode;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+
+ if (sbi->s_version == MINIX_V3)
+ res = ((minix3_dirent *) de)->inode;
+ else
+ res = de->inode;
dir_put_page(page);
}
return res;
diff --git a/fs/namei.c b/fs/namei.c
index 1f13751693a5..b83d38f614ff 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
EXPORT_SYMBOL(putname);
#endif
-
-/**
- * generic_permission - check for access rights on a Posix-like filesystem
- * @inode: inode to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl: optional callback to check for Posix ACLs
- *
- * Used to check for read/write/execute permissions on a file.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things..
+/*
+ * This does basic POSIX ACL permission checking
*/
-int generic_permission(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask,
int (*check_acl)(struct inode *inode, int mask))
{
umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
int error = check_acl(inode, mask);
- if (error == -EACCES)
- goto check_capabilities;
- else if (error != -EAGAIN)
+ if (error != -EAGAIN)
return error;
}
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
*/
if ((mask & ~mode) == 0)
return 0;
+ return -EACCES;
+}
+
+/**
+ * generic_permission - check for access rights on a Posix-like filesystem
+ * @inode: inode to check access rights for
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl: optional callback to check for Posix ACLs
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+int generic_permission(struct inode *inode, int mask,
+ int (*check_acl)(struct inode *inode, int mask))
+{
+ int ret;
+
+ /*
+ * Do the basic POSIX ACL permission checks.
+ */
+ ret = acl_permission_check(inode, mask, check_acl);
+ if (ret != -EACCES)
+ return ret;
- check_capabilities:
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
if (inode->i_op->permission)
retval = inode->i_op->permission(inode, mask);
else
- retval = generic_permission(inode, mask, NULL);
+ retval = generic_permission(inode, mask, inode->i_op->check_acl);
if (retval)
return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
*/
static int exec_permission_lite(struct inode *inode)
{
- umode_t mode = inode->i_mode;
-
- if (inode->i_op->permission)
- return -EAGAIN;
-
- if (current_fsuid() == inode->i_uid)
- mode >>= 6;
- else if (in_group_p(inode->i_gid))
- mode >>= 3;
-
- if (mode & MAY_EXEC)
- goto ok;
+ int ret;
- if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
- goto ok;
-
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
+ if (inode->i_op->permission) {
+ ret = inode->i_op->permission(inode, MAY_EXEC);
+ if (!ret)
+ goto ok;
+ return ret;
+ }
+ ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+ if (!ret)
goto ok;
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
+ if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
goto ok;
- return -EACCES;
+ return ret;
ok:
return security_inode_permission(inode, MAY_EXEC);
}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode);
- if (err == -EAGAIN)
- err = inode_permission(nd->path.dentry->d_inode,
- MAY_EXEC);
- if (!err)
- err = ima_path_check(&nd->path, MAY_EXEC,
- IMA_COUNT_UPDATE);
if (err)
break;
@@ -1533,9 +1533,11 @@ int may_open(struct path *path, int acc_mode, int flag)
if (error)
return error;
- error = ima_path_check(path,
- acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+ error = ima_path_check(path, acc_mode ?
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+ ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
IMA_COUNT_UPDATE);
+
if (error)
return error;
/*
@@ -1676,6 +1678,15 @@ struct file *do_filp_open(int dfd, const char *pathname,
int will_write;
int flag = open_to_namei_flags(open_flag);
+ /*
+ * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+ * check for O_DSYNC if the need any syncing at all we enforce it's
+ * always set instead of having to deal with possibly weird behaviour
+ * for malicious applications setting only __O_SYNC.
+ */
+ if (open_flag & __O_SYNC)
+ open_flag |= O_DSYNC;
+
if (!acc_mode)
acc_mode = MAY_OPEN | ACC_MODE(flag);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b0..bdc3cb4fd222 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
{
struct vfsmount *mnt;
- if (!type || !memchr(type, 0, PAGE_SIZE))
+ if (!type)
return -EINVAL;
/* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
return 0;
}
+int copy_mount_string(const void __user *data, char **where)
+{
+ char *tmp;
+
+ if (!data) {
+ *where = NULL;
+ return 0;
+ }
+
+ tmp = strndup_user(data, PAGE_SIZE);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ *where = tmp;
+ return 0;
+}
+
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
- if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
- return -EINVAL;
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
- int retval;
+ int ret;
+ char *kernel_type;
+ char *kernel_dir;
+ char *kernel_dev;
unsigned long data_page;
- unsigned long type_page;
- unsigned long dev_page;
- char *dir_page;
- retval = copy_mount_options(type, &type_page);
- if (retval < 0)
- return retval;
+ ret = copy_mount_string(type, &kernel_type);
+ if (ret < 0)
+ goto out_type;
- dir_page = getname(dir_name);
- retval = PTR_ERR(dir_page);
- if (IS_ERR(dir_page))
- goto out1;
+ kernel_dir = getname(dir_name);
+ if (IS_ERR(kernel_dir)) {
+ ret = PTR_ERR(kernel_dir);
+ goto out_dir;
+ }
- retval = copy_mount_options(dev_name, &dev_page);
- if (retval < 0)
- goto out2;
+ ret = copy_mount_string(dev_name, &kernel_dev);
+ if (ret < 0)
+ goto out_dev;
- retval = copy_mount_options(data, &data_page);
- if (retval < 0)
- goto out3;
+ ret = copy_mount_options(data, &data_page);
+ if (ret < 0)
+ goto out_data;
- retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
- flags, (void *)data_page);
- free_page(data_page);
+ ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+ (void *) data_page);
-out3:
- free_page(dev_page);
-out2:
- putname(dir_page);
-out1:
- free_page(type_page);
- return retval;
+ free_page(data_page);
+out_data:
+ kfree(kernel_dev);
+out_dev:
+ putname(kernel_dir);
+out_dir:
+ kfree(kernel_type);
+out_type:
+ return ret;
}
/*
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c590722d87e..b8b5b30d53f0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1241,7 +1241,7 @@ ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
month = 2;
} else {
nl_day = (year & 3) || day <= 59 ? day : day - 1;
- for (month = 0; month < 12; month++)
+ for (month = 1; month < 12; month++)
if (day_n[month] > nl_day)
break;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b99ce205b1bd..cf98da1be23e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
#ifdef CONFIG_NCPFS_NLS
/* unload the NLS charsets */
- if (server->nls_vol)
- {
- unload_nls(server->nls_vol);
- server->nls_vol = NULL;
- }
- if (server->nls_io)
- {
- unload_nls(server->nls_io);
- server->nls_io = NULL;
- }
+ unload_nls(server->nls_vol);
+ unload_nls(server->nls_io);
#endif /* CONFIG_NCPFS_NLS */
if (server->info_filp)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index fa038df63ac8..0d58caf4a6e1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
oldset_io = server->nls_io;
server->nls_io = iocharset;
- if (oldset_cp)
- unload_nls(oldset_cp);
- if (oldset_io)
- unload_nls(oldset_io);
+ unload_nls(oldset_cp);
+ unload_nls(oldset_io);
return 0;
}
@@ -442,7 +440,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
if (dentry) {
struct inode* s_inode = dentry->d_inode;
- if (inode) {
+ if (s_inode) {
NCP_FINFO(s_inode)->volNumber = vnum;
NCP_FINFO(s_inode)->dirEntNum = de;
NCP_FINFO(s_inode)->DosDirNum = dosde;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5d8dcb9ee326..15458decdb8a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
return VM_FAULT_MAJOR;
}
-static struct vm_operations_struct ncp_file_mmap =
+static const struct vm_operations_struct ncp_file_mmap =
{
.fault = ncp_file_mmap_fault,
};
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a77bc25d5af..59e5673b4597 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -90,7 +90,7 @@ config ROOT_NFS
If you want your system to mount its root file system via NFS,
choose Y here. This is common practice for managing systems
without local permanent storage. For details, read
- <file:Documentation/filesystems/nfsroot.txt>.
+ <file:Documentation/filesystems/nfs/nfsroot.txt>.
Most people say N here.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
direct.o pagelist.o proc.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o
+ write.o namespace.o mount_clnt.o \
+ dns_resolve.o cache_lib.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[] = {
+ nfs_cache_getent_prog,
+ cd->name,
+ entry_name,
+ NULL
+ };
+ int ret = -EACCES;
+
+ if (nfs_cache_getent_prog[0] == '\0')
+ goto out;
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the 'cache_getent' parameter once the problem
+ * has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES)
+ nfs_cache_getent_prog[0] = '\0';
+out:
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+ if (atomic_dec_and_test(&dreq->count))
+ kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+ complete_all(&dreq->completion);
+ nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(req, struct nfs_cache_defer_req, req);
+ dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+ atomic_inc(&dreq->count);
+
+ return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+ if (dreq) {
+ init_completion(&dreq->completion);
+ atomic_set(&dreq->count, 1);
+ dreq->req.defer = nfs_dns_cache_defer;
+ }
+ return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+ if (wait_for_completion_timeout(&dreq->completion,
+ nfs_cache_getent_timeout * HZ) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
+int nfs_cache_register(struct cache_detail *cd)
+{
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ int ret;
+
+ mnt = rpc_get_mount();
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+ if (ret)
+ goto err;
+ ret = sunrpc_cache_register_pipefs(nd.path.dentry,
+ cd->name, 0600, cd);
+ path_put(&nd.path);
+ if (!ret)
+ return ret;
+err:
+ rpc_put_mount();
+ return ret;
+}
+
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+ rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <asm/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+ struct cache_req req;
+ struct cache_deferred_req deferred_req;
+ struct completion completion;
+ atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
-static int param_set_port(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, struct kernel_param *kp)
{
- char *endp;
- int num = simple_strtol(val, &endp, 0);
- if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = strict_strtoul(val, 0, &num);
+ if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
return -EINVAL;
- *((int *)kp->arg) = num;
+ *((unsigned int *)kp->arg) = num;
return 0;
}
-module_param_call(callback_tcpport, param_set_port, param_get_int,
- &nfs_callback_set_tcpport, 0644);
+static int param_get_portnr(char *buffer, struct kernel_param *kp)
+{
+ return param_get_uint(buffer, kp);
+}
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
/*
* This is the NFSv4 callback kernel thread.
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e5a2dac5f715..76b0aa0f73bf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
p = read_buf(xdr, len);
if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);;
+ return htonl(NFS4ERR_RESOURCE);
memcpy(sid->data, p, len);
return 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..99ea196f071f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
.hostname = clp->cl_hostname,
.address = (struct sockaddr *)&clp->cl_addr,
.addrlen = clp->cl_addrlen,
- .protocol = server->flags & NFS_MOUNT_TCP ?
- IPPROTO_TCP : IPPROTO_UDP,
.nfs_version = clp->rpc_ops->version,
.noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
if (server->flags & NFS_MOUNT_NONLM)
return 0;
+ switch (clp->cl_proto) {
+ default:
+ nlm_init.protocol = IPPROTO_TCP;
+ break;
+ case XPRT_TRANSPORT_UDP:
+ nlm_init.protocol = IPPROTO_UDP;
+ }
+
host = nlmclnt_init(&nlm_init);
if (IS_ERR(host))
return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
dprintk("--> nfs_init_server()\n");
#ifdef CONFIG_NFS_V3
- if (data->flags & NFS_MOUNT_VER3)
+ if (data->version == 3)
cl_init.rpc_ops = &nfs_v3_clientops;
#endif
@@ -809,6 +815,9 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->options = data->options;
+ server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +888,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
if (server->wsize > max_rpc_payload)
@@ -929,10 +939,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
goto out_error;
nfs_server_set_fsinfo(server, &fsinfo);
- error = bdi_init(&server->backing_dev_info);
- if (error)
- goto out_error;
-
/* Get some general file system info */
if (server->namelen == 0) {
@@ -964,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
target->acdirmin = source->acdirmin;
target->acdirmax = source->acdirmax;
target->caps = source->caps;
+ target->options = source->options;
}
/*
@@ -991,6 +998,12 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ if (bdi_init(&server->backing_dev_info)) {
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+ return NULL;
+ }
+
return server;
}
@@ -1074,10 +1087,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1171,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1, flags & NFS_MOUNT_NORESVPORT);
if (error < 0)
goto error;
- memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
error = nfs_idmap_new(clp);
if (error < 0) {
@@ -1274,7 +1283,7 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
server->options = data->options;
/* Get a client record */
@@ -1359,10 +1368,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1405,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Initialise the client representation from the parent server */
nfs_server_copy_userdata(server, parent_server);
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
@@ -1533,7 +1538,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_server_list_stop(struct seq_file *p, void *v);
static int nfs_server_list_show(struct seq_file *m, void *v);
-static struct seq_operations nfs_server_list_ops = {
+static const struct seq_operations nfs_server_list_ops = {
.start = nfs_server_list_start,
.next = nfs_server_list_next,
.stop = nfs_server_list_stop,
@@ -1554,7 +1559,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_volume_list_stop(struct seq_file *p, void *v);
static int nfs_volume_list_show(struct seq_file *m, void *v);
-static struct seq_operations nfs_volume_list_ops = {
+static const struct seq_operations nfs_volume_list_ops = {
.start = nfs_volume_list_start,
.next = nfs_volume_list_next,
.stop = nfs_volume_list_stop,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32062c33c859..7cb298525eef 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
+ nfs_inode_return_delegation(inode);
+
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4e089a8f294..e1d415e97849 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
};
struct rpc_task_setup task_setup_data = {
.rpc_client = NFS_CLIENT(inode),
+ .rpc_message = &msg,
.callback_ops = &nfs_write_direct_ops,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
@@ -934,9 +935,6 @@ out:
* back into its cache. We let the server do generic write
* parameter checking and report problems.
*
- * We also avoid an unnecessary invocation of generic_osync_inode(),
- * as it is fairly meaningless to sync the metadata of an NFS file.
- *
* We eliminate local atime updates, see direct read above.
*
* We avoid unnecessary page cache invalidations for normal cached
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+struct nfs_dns_ent {
+ struct cache_head h;
+
+ char *hostname;
+ size_t namelen;
+
+ struct sockaddr_storage addr;
+ size_t addrlen;
+};
+
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ kfree(new->hostname);
+ new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+ if (new->hostname) {
+ new->namelen = key->namelen;
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+ } else {
+ new->namelen = 0;
+ new->addrlen = 0;
+ }
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ kfree(item->hostname);
+ kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+ struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+ if (item != NULL) {
+ item->hostname = NULL;
+ item->namelen = 0;
+ item->addrlen = 0;
+ return &item->h;
+ }
+ return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+ return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ (*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+ int ret;
+
+ ret = nfs_cache_upcall(cd, key->hostname);
+ if (ret)
+ ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+ struct cache_head *cb)
+{
+ struct nfs_dns_ent *a;
+ struct nfs_dns_ent *b;
+
+ a = container_of(ca, struct nfs_dns_ent, h);
+ b = container_of(cb, struct nfs_dns_ent, h);
+
+ if (a->namelen == 0 || a->namelen != b->namelen)
+ return 0;
+ return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct nfs_dns_ent *item;
+ long ttl;
+
+ if (h == NULL) {
+ seq_puts(m, "# ip address hostname ttl\n");
+ return 0;
+ }
+ item = container_of(h, struct nfs_dns_ent, h);
+ ttl = (long)item->h.expiry_time - (long)get_seconds();
+ if (ttl < 0)
+ ttl = 0;
+
+ if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+ char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+ rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+ seq_printf(m, "%15s ", buf);
+ } else
+ seq_puts(m, "<none> ");
+ seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+ return 0;
+}
+
+struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_lookup(cd,
+ &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+ struct nfs_dns_ent *new,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_update(cd,
+ &new->h, &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+ char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+ struct nfs_dns_ent key, *item;
+ unsigned long ttl;
+ ssize_t len;
+ int ret = -EINVAL;
+
+ if (buf[buflen-1] != '\n')
+ goto out;
+ buf[buflen-1] = '\0';
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+ key.addrlen = rpc_pton(buf1, len,
+ (struct sockaddr *)&key.addr,
+ sizeof(key.addr));
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+
+ key.hostname = buf1;
+ key.namelen = len;
+ memset(&key.h, 0, sizeof(key.h));
+
+ ttl = get_expiry(&buf);
+ if (ttl == 0)
+ goto out;
+ key.h.expiry_time = ttl + get_seconds();
+
+ ret = -ENOMEM;
+ item = nfs_dns_lookup(cd, &key);
+ if (item == NULL)
+ goto out;
+
+ if (key.addrlen == 0)
+ set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+ item = nfs_dns_update(cd, &key, item);
+ if (item == NULL)
+ goto out;
+
+ ret = 0;
+ cache_put(&item->h, cd);
+out:
+ return ret;
+}
+
+static struct cache_detail nfs_dns_resolve = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .hash_table = nfs_dns_table,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_init,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+static int do_cache_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item,
+ struct nfs_cache_defer_req *dreq)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (*item) {
+ ret = cache_check(cd, &(*item)->h, &dreq->req);
+ if (ret)
+ *item = NULL;
+ }
+ return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < get_seconds()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ struct nfs_cache_defer_req *dreq;
+ int ret = -ENOMEM;
+
+ dreq = nfs_cache_defer_req_alloc();
+ if (!dreq)
+ goto out;
+ ret = do_cache_lookup(cd, key, item, dreq);
+ if (ret == -EAGAIN) {
+ ret = nfs_cache_wait_for_upcall(dreq);
+ if (!ret)
+ ret = do_cache_lookup_nowait(cd, key, item);
+ }
+ nfs_cache_defer_req_put(dreq);
+out:
+ return ret;
+}
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ struct nfs_dns_ent key = {
+ .hostname = name,
+ .namelen = namelen,
+ };
+ struct nfs_dns_ent *item = NULL;
+ ssize_t ret;
+
+ ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+ if (ret == 0) {
+ if (salen >= item->addrlen) {
+ memcpy(sa, &item->addr, item->addrlen);
+ ret = item->addrlen;
+ } else
+ ret = -EOVERFLOW;
+ cache_put(&item->h, &nfs_dns_resolve);
+ } else if (ret == -ENOENT)
+ ret = -ESRCH;
+ return ret;
+}
+
+int nfs_dns_resolver_init(void)
+{
+ return nfs_cache_register(&nfs_dns_resolve);
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+ nfs_cache_unregister(&nfs_dns_resolve);
+}
+
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..6b891328f332 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
-static struct vm_operations_struct nfs_file_vm_ops;
+static const struct vm_operations_struct nfs_file_vm_ops;
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
}
/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned len)
+{
+ unsigned int pglen = nfs_page_length(page);
+ unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int end = offset + len;
+
+ if ((file->f_mode & FMODE_READ) && /* open for read? */
+ !PageUptodate(page) && /* Uptodate? */
+ !PagePrivate(page) && /* i/o request already? */
+ pglen && /* valid bytes of file? */
+ (end < pglen || offset)) /* replace all valid bytes? */
+ return 1;
+ return 0;
+}
+
+/*
* This does the "real" work of the write. We must allocate and lock the
* page to be sent back to the generic routine, which then copies the
* data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int ret;
- pgoff_t index;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct page *page;
- index = pos >> PAGE_CACHE_SHIFT;
+ int once_thru = 0;
dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+start:
/*
* Prevent starvation issues if someone is doing a consistency
* sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
if (ret) {
unlock_page(page);
page_cache_release(page);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+ ret = nfs_readpage(file, page);
+ page_cache_release(page);
+ if (!ret)
+ goto start;
}
return ret;
}
@@ -479,7 +523,9 @@ const struct address_space_operations nfs_file_aops = {
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
+ .migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
+ .error_remove_page = generic_error_remove_page,
};
/*
@@ -526,7 +572,7 @@ out_unlock:
return VM_FAULT_SIGBUS;
}
-static struct vm_operations_struct nfs_file_vm_ops = {
+static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nfs_vm_page_mkwrite,
};
@@ -535,7 +581,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
{
struct nfs_open_context *ctx;
- if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
+ if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
return 1;
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
@@ -576,7 +622,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
- /* Return error values for O_SYNC and IS_SYNC() */
+ /* Return error values for O_DSYNC and IS_SYNC() */
if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
if (err < 0)
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7e..70fad69eb959 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
/*
* Get the cache cookie for an NFS superblock. We have to handle
* uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
*/
-void nfs_fscache_get_super_cookie(struct super_block *sb,
- struct nfs_parsed_mount_data *data)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
+ struct nfs_clone_mount *mntdata)
{
struct nfs_fscache_key *key, *xkey;
struct nfs_server *nfss = NFS_SB(sb);
struct rb_node **p, *parent;
- const char *uniq = data->fscache_uniq ?: "";
int diff, ulen;
- ulen = strlen(uniq);
+ if (uniq) {
+ ulen = strlen(uniq);
+ } else if (mntdata) {
+ struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
+ if (mnt_s->fscache_key) {
+ uniq = mnt_s->fscache_key->key.uniquifier;
+ ulen = mnt_s->fscache_key->key.uniq_len;
+ }
+ }
+
+ if (!uniq) {
+ uniq = "";
+ ulen = 1;
+ }
+
key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
if (!key)
return;
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff08..b9c572d0679f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
extern void nfs_fscache_release_client_cookie(struct nfs_client *);
extern void nfs_fscache_get_super_cookie(struct super_block *,
- struct nfs_parsed_mount_data *);
+ const char *,
+ struct nfs_clone_mount *);
extern void nfs_fscache_release_super_cookie(struct super_block *);
extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
static inline void nfs_fscache_get_super_cookie(
struct super_block *sb,
- struct nfs_parsed_mount_data *data)
+ const char *uniq,
+ struct nfs_clone_mount *mntdata)
{
}
static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
static unsigned int fnvhash32(const void *, size_t);
-static struct rpc_pipe_ops idmap_upcall_ops = {
+static const struct rpc_pipe_ops idmap_upcall_ops = {
.upcall = idmap_pipe_upcall,
.downcall = idmap_pipe_downcall,
.destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
if (idmap == NULL)
return -ENOMEM;
- idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
- idmap, &idmap_upcall_ops, 0);
+ idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+ "idmap", idmap, &idmap_upcall_ops, 0);
if (IS_ERR(idmap->idmap_dentry)) {
error = PTR_ERR(idmap->idmap_dentry);
kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..faa091865ad0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "dns_resolve.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
/* We can't support update_atime(), since the server will reset it */
inode->i_flags |= S_NOATIME|S_NOCMTIME;
inode->i_mode = fattr->mode;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && nfs_server_capable(inode, NFS_CAP_MODE))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
+ else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
+ else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
+ else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
nfsi->change_attr = fattr->change_attr;
+ else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE;
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
inode->i_nlink = fattr->nlink;
+ else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -426,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
*/
static int nfs_vmtruncate(struct inode * inode, loff_t offset)
{
- if (i_size_read(inode) < offset) {
- unsigned long limit;
-
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out_big;
- spin_lock(&inode->i_lock);
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
- } else {
- struct address_space *mapping = inode->i_mapping;
+ loff_t oldsize;
+ int err;
- /*
- * truncation of in-use swapfiles is disallowed - it would
- * cause subsequent swapout to scribble on the now-freed
- * blocks.
- */
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
- spin_lock(&inode->i_lock);
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
+ err = inode_newsize_ok(inode, offset);
+ if (err)
+ goto out;
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- }
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
+ spin_lock(&inode->i_lock);
+ oldsize = inode->i_size;
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+
+ truncate_pagecache(inode, oldsize, offset);
+out:
+ return err;
}
/**
@@ -1145,6 +1149,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
loff_t cur_isize, new_isize;
unsigned long invalid = 0;
unsigned long now = jiffies;
+ unsigned long save_cache_validity;
dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1176,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
+ save_cache_validity = nfsi->cache_validity;
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED
+ | NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1195,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
nfsi->change_attr = fattr->change_attr;
}
- }
+ } else if (server->caps & NFS_CAP_CHANGE_ATTR)
+ invalid |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
/* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1208,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
}
- }
+ } else if (server->caps & NFS_CAP_MTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
/* If ctime has changed we should definitely clear access+acl caches */
if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1227,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
}
- }
+ } else if (server->caps & NFS_CAP_CTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1247,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dprintk("NFS: isize change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
}
- }
+ } else
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+ else if (server->caps & NFS_CAP_ATIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_mode = fattr->mode;
}
- }
+ } else if (server->caps & NFS_CAP_MODE)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (inode->i_uid != fattr->uid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (inode->i_gid != fattr->gid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1299,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
inode->i_nlink = fattr->nlink;
}
- }
+ } else if (server->caps & NFS_CAP_NLINK)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
@@ -1293,9 +1331,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
|| S_ISLNK(inode->i_mode)))
invalid &= ~NFS_INO_INVALID_DATA;
if (!nfs_have_delegation(inode, FMODE_READ) ||
- (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
+ (save_cache_validity & NFS_INO_REVAL_FORCED))
nfsi->cache_validity |= invalid;
- nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
return 0;
out_changed:
@@ -1442,6 +1479,10 @@ static int __init init_nfs_fs(void)
{
int err;
+ err = nfs_dns_resolver_init();
+ if (err < 0)
+ goto out8;
+
err = nfs_fscache_register();
if (err < 0)
goto out7;
@@ -1500,6 +1541,8 @@ out5:
out6:
nfs_fscache_unregister();
out7:
+ nfs_dns_resolver_destroy();
+out8:
return err;
}
@@ -1511,6 +1554,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
+ nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister("nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -49,6 +49,11 @@ struct nfs_clone_mount {
#define NFS_MAX_SECFLAVORS (12)
/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT (-1)
+
+/*
* In-kernel mount arguments
*/
struct nfs_parsed_mount_data {
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data {
unsigned int auth_flavor_len;
rpc_authflavor_t auth_flavors[1];
char *client_address;
+ unsigned int version;
unsigned int minorversion;
char *fscache_uniq;
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data {
size_t addrlen;
char *hostname;
u32 version;
- unsigned short port;
+ int port;
unsigned short protocol;
} mount_server;
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data {
size_t addrlen;
char *hostname;
char *export_path;
- unsigned short port;
+ int port;
unsigned short protocol;
} nfs_server;
@@ -102,6 +108,7 @@ struct nfs_mount_request {
};
extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
/* client.c */
extern struct rpc_program nfs_program;
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode);
extern int nfs_wait_bit_killable(void *word);
/* super.c */
-void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
extern struct file_system_type nfs_xdev_fs_type;
#ifdef CONFIG_NFS_V4
extern struct file_system_type nfs4_xdev_fs_type;
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
/* write.c */
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+ struct page *, struct page *);
+#else
+#define nfs_migrate_page NULL
+#endif
/* nfs4proc.c */
extern int _nfs4_call_sync(struct nfs_server *server,
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
return ((unsigned long)len + (unsigned long)base +
PAGE_SIZE - 1) >> PAGE_SHIFT;
}
-
-#define IPV6_SCOPE_DELIMITER '%'
-
-/*
- * Set the port number in an address. Be agnostic about the address
- * family.
- */
-static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
-
- switch (sap->sa_family) {
- case AF_INET:
- ap->sin_port = htons(port);
- break;
- case AF_INET6:
- ap6->sin6_port = htons(port);
- break;
- }
-}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..0adefc40cc89 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
goto out;
}
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+ static const struct rpc_timeout nfs_umnt_timeout = {
+ .to_initval = 1 * HZ,
+ .to_maxval = 3 * HZ,
+ .to_retries = 2,
+ };
+ struct rpc_create_args args = {
+ .protocol = IPPROTO_UDP,
+ .address = info->sap,
+ .addrsize = info->salen,
+ .timeout = &nfs_umnt_timeout,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = RPC_CLNT_CREATE_NOPING,
+ };
+ struct mountres result;
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ .rpc_resp = &result,
+ };
+ struct rpc_clnt *clnt;
+ int status;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ clnt = rpc_create(&args);
+ if (unlikely(IS_ERR(clnt)))
+ goto out_clnt_err;
+
+ dprintk("NFS: sending UMNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"), info->dirpath);
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+ else
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+ status = rpc_call_sync(clnt, &msg, 0);
+ rpc_shutdown_client(clnt);
+
+ if (unlikely(status < 0))
+ goto out_call_err;
+
+ return;
+
+out_clnt_err:
+ dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+ PTR_ERR(clnt));
+ return;
+
+out_call_err:
+ dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
/*
* XDR encode/decode functions for MOUNT
*/
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
return -EIO;
status = ntohl(*p);
- for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) {
+ for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
if (mnt_errtbl[i].status == status) {
res->errno = mnt_errtbl[i].errno;
return 0;
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
return -EIO;
status = ntohl(*p);
- for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) {
+ for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
if (mnt3_errtbl[i].status == status) {
res->errno = mnt3_errtbl[i].errno;
return 0;
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
.p_statidx = MOUNTPROC_MNT,
.p_name = "MOUNT",
},
+ [MOUNTPROC_UMNT] = {
+ .p_proc = MOUNTPROC_UMNT,
+ .p_encode = (kxdrproc_t)mnt_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC_UMNT,
+ .p_name = "UMOUNT",
+ },
};
static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
.p_statidx = MOUNTPROC3_MNT,
.p_name = "MOUNT",
},
+ [MOUNTPROC3_UMNT] = {
+ .p_proc = MOUNTPROC3_UMNT,
+ .p_encode = (kxdrproc_t)mnt_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC3_UMNT,
+ .p_name = "UMOUNT",
+ },
};
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
*/
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
@@ -299,7 +298,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
/*
* Create a regular file.
- * For now, we don't implement O_EXCL.
*/
static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..fa3408f20112 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
#include <linux/inet.h>
#include "internal.h"
#include "nfs4_fs.h"
+#include "dns_resolve.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
return 0;
}
+static size_t nfs_parse_server_name(char *string, size_t len,
+ struct sockaddr *sa, size_t salen)
+{
+ ssize_t ret;
+
+ ret = rpc_pton(string, len, sa, salen);
+ if (ret == 0) {
+ ret = nfs_dns_resolve_name(string, len, sa, salen);
+ if (ret < 0)
+ ret = 0;
+ }
+ return ret;
+}
+
static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
char *page, char *page2,
const struct nfs4_fs_location *location)
@@ -106,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
if (IS_ERR(mnt_path))
- return mnt;
+ return ERR_CAST(mnt_path);
mountdata->mnt_path = mnt_path;
maxbuflen = mnt_path - 1 - page2;
@@ -117,15 +132,16 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
if (buf->len <= 0 || buf->len >= maxbuflen)
continue;
- mountdata->addr = (struct sockaddr *)&addr;
-
if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
continue;
- nfs_parse_ip_address(buf->data, buf->len,
- mountdata->addr, &mountdata->addrlen);
- if (mountdata->addr->sa_family == AF_UNSPEC)
+
+ mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
+ (struct sockaddr *)&addr, sizeof(addr));
+ if (mountdata->addrlen == 0)
continue;
- nfs_set_port(mountdata->addr, NFS_PORT);
+
+ mountdata->addr = (struct sockaddr *)&addr;
+ rpc_set_port(mountdata->addr, NFS_PORT);
memcpy(page2, buf->data, buf->len);
page2[buf->len] = '\0';
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..741a562177fc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
*/
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/string.h>
@@ -61,6 +60,8 @@
#define NFS4_POLL_RETRY_MIN (HZ/10)
#define NFS4_POLL_RETRY_MAX (15*HZ)
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -71,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
{
- if (err < -1000) {
+ if (err >= -1000)
+ return err;
+ switch (err) {
+ case -NFS4ERR_RESOURCE:
+ return -EREMOTEIO;
+ default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
- return -EIO;
+ break;
}
- return err;
+ return -EIO;
}
/*
@@ -426,17 +432,19 @@ out:
static int nfs4_recover_session(struct nfs4_session *session)
{
struct nfs_client *clp = session->clp;
+ unsigned int loop;
int ret;
- for (;;) {
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
ret = nfs4_wait_clnt_recover(clp);
if (ret != 0)
- return ret;
+ break;
if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
break;
nfs4_schedule_state_manager(clp);
+ ret = -EIO;
}
- return 0;
+ return ret;
}
static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1452,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
static int nfs4_recover_expired_lease(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
+ unsigned int loop;
int ret;
- for (;;) {
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
ret = nfs4_wait_clnt_recover(clp);
if (ret != 0)
- return ret;
+ break;
if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
!test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
break;
nfs4_schedule_state_recovery(clp);
+ ret = -EIO;
}
- return 0;
+ return ret;
}
/*
@@ -1997,12 +2007,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
status = nfs4_call_sync(server, &msg, &args, &res, 0);
if (status == 0) {
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+ server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+ NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+ NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+ NFS_CAP_CTIME|NFS_CAP_MTIME);
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
server->caps |= NFS_CAP_ACLS;
if (res.has_links != 0)
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+ if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+ server->caps |= NFS_CAP_FILEID;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+ server->caps |= NFS_CAP_MODE;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+ server->caps |= NFS_CAP_NLINK;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+ server->caps |= NFS_CAP_OWNER;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+ server->caps |= NFS_CAP_OWNER_GROUP;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+ server->caps |= NFS_CAP_ATIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+ server->caps |= NFS_CAP_CTIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+ server->caps |= NFS_CAP_MTIME;
+
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2735,7 +2767,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.pages = &page,
.pgbase = 0,
.count = count,
- .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
+ .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
};
struct nfs4_readdir_res res;
struct rpc_message msg = {
@@ -3033,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
if (time_before(clp->cl_last_renewal,timestamp))
clp->cl_last_renewal = timestamp;
spin_unlock(&clp->cl_lock);
- dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
- task->tk_msg.rpc_cred);
- put_rpccred(task->tk_msg.rpc_cred);
}
static const struct rpc_call_ops nfs4_renew_ops = {
@@ -4850,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
- put_rpccred(task->tk_msg.rpc_cred);
kfree(task->tk_msg.rpc_argp);
kfree(task->tk_msg.rpc_resp);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6cef18f2..0156c01c212c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
}
void
-nfs4_renewd_prepare_shutdown(struct nfs_server *server)
-{
- cancel_delayed_work(&server->nfs_client->cl_renewd);
-}
-
-void
nfs4_kill_renewd(struct nfs_client *clp)
{
cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
}
-static struct file_lock_operations nfs4_fl_lock_ops = {
+static const struct file_lock_operations nfs4_fl_lock_ops = {
.fl_copy_lock = nfs4_fl_copy_lock,
.fl_release_private = nfs4_fl_release_lock,
};
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..20b4e30e6c82 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
@@ -702,29 +701,12 @@ struct compound_hdr {
u32 minorversion;
};
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- * These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define WRITE32(n) *p++ = htonl(n)
-#define WRITE64(n) do { \
- *p++ = htonl((uint32_t)((n) >> 32)); \
- *p++ = htonl((uint32_t)(n)); \
-} while (0)
-#define WRITEMEM(ptr,nbytes) do { \
- p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
-} while (0)
-
-#define RESERVE_SPACE(nbytes) do { \
- p = xdr_reserve_space(xdr, nbytes); \
- BUG_ON(!p); \
-} while (0)
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p = xdr_reserve_space(xdr, nbytes);
+ BUG_ON(!p);
+ return p;
+}
static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
@@ -749,12 +731,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
- RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
- WRITE32(hdr->taglen);
- WRITEMEM(hdr->tag, hdr->taglen);
- WRITE32(hdr->minorversion);
+ p = reserve_space(xdr, 4 + hdr->taglen + 8);
+ p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
+ *p++ = cpu_to_be32(hdr->minorversion);
hdr->nops_p = p;
- WRITE32(hdr->nops);
+ *p = cpu_to_be32(hdr->nops);
}
static void encode_nops(struct compound_hdr *hdr)
@@ -829,55 +810,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len += 16;
else if (iap->ia_valid & ATTR_MTIME)
len += 4;
- RESERVE_SPACE(len);
+ p = reserve_space(xdr, len);
/*
* We write the bitmap length now, but leave the bitmap and the attribute
* buffer length to be backfilled at the end of this routine.
*/
- WRITE32(2);
+ *p++ = cpu_to_be32(2);
q = p;
p += 3;
if (iap->ia_valid & ATTR_SIZE) {
bmval0 |= FATTR4_WORD0_SIZE;
- WRITE64(iap->ia_size);
+ p = xdr_encode_hyper(p, iap->ia_size);
}
if (iap->ia_valid & ATTR_MODE) {
bmval1 |= FATTR4_WORD1_MODE;
- WRITE32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
}
if (iap->ia_valid & ATTR_UID) {
bmval1 |= FATTR4_WORD1_OWNER;
- WRITE32(owner_namelen);
- WRITEMEM(owner_name, owner_namelen);
+ p = xdr_encode_opaque(p, owner_name, owner_namelen);
}
if (iap->ia_valid & ATTR_GID) {
bmval1 |= FATTR4_WORD1_OWNER_GROUP;
- WRITE32(owner_grouplen);
- WRITEMEM(owner_group, owner_grouplen);
+ p = xdr_encode_opaque(p, owner_group, owner_grouplen);
}
if (iap->ia_valid & ATTR_ATIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- WRITE32(NFS4_SET_TO_CLIENT_TIME);
- WRITE32(0);
- WRITE32(iap->ia_mtime.tv_sec);
- WRITE32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
}
else if (iap->ia_valid & ATTR_ATIME) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- WRITE32(NFS4_SET_TO_SERVER_TIME);
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
if (iap->ia_valid & ATTR_MTIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- WRITE32(NFS4_SET_TO_CLIENT_TIME);
- WRITE32(0);
- WRITE32(iap->ia_mtime.tv_sec);
- WRITE32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
}
else if (iap->ia_valid & ATTR_MTIME) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- WRITE32(NFS4_SET_TO_SERVER_TIME);
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
/*
@@ -891,7 +870,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len = (char *)p - (char *)q - 12;
*q++ = htonl(bmval0);
*q++ = htonl(bmval1);
- *q++ = htonl(len);
+ *q = htonl(len);
/* out: */
}
@@ -900,9 +879,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
{
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(OP_ACCESS);
- WRITE32(access);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_ACCESS);
+ *p = cpu_to_be32(access);
hdr->nops++;
hdr->replen += decode_access_maxsz;
}
@@ -911,10 +890,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
{
__be32 *p;
- RESERVE_SPACE(8+NFS4_STATEID_SIZE);
- WRITE32(OP_CLOSE);
- WRITE32(arg->seqid->sequence->counter);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_CLOSE);
+ *p++ = cpu_to_be32(arg->seqid->sequence->counter);
+ xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_close_maxsz;
}
@@ -923,10 +902,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
{
__be32 *p;
- RESERVE_SPACE(16);
- WRITE32(OP_COMMIT);
- WRITE64(args->offset);
- WRITE32(args->count);
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(OP_COMMIT);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
hdr->nops++;
hdr->replen += decode_commit_maxsz;
}
@@ -935,30 +914,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
{
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(OP_CREATE);
- WRITE32(create->ftype);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_CREATE);
+ *p = cpu_to_be32(create->ftype);
switch (create->ftype) {
case NF4LNK:
- RESERVE_SPACE(4);
- WRITE32(create->u.symlink.len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(create->u.symlink.len);
xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
break;
case NF4BLK: case NF4CHR:
- RESERVE_SPACE(8);
- WRITE32(create->u.device.specdata1);
- WRITE32(create->u.device.specdata2);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(create->u.device.specdata1);
+ *p = cpu_to_be32(create->u.device.specdata2);
break;
default:
break;
}
- RESERVE_SPACE(4 + create->name->len);
- WRITE32(create->name->len);
- WRITEMEM(create->name->name, create->name->len);
+ encode_string(xdr, create->name->len, create->name->name);
hdr->nops++;
hdr->replen += decode_create_maxsz;
@@ -969,10 +946,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
{
__be32 *p;
- RESERVE_SPACE(12);
- WRITE32(OP_GETATTR);
- WRITE32(1);
- WRITE32(bitmap);
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(OP_GETATTR);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(bitmap);
hdr->nops++;
hdr->replen += decode_getattr_maxsz;
}
@@ -981,11 +958,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
{
__be32 *p;
- RESERVE_SPACE(16);
- WRITE32(OP_GETATTR);
- WRITE32(2);
- WRITE32(bm0);
- WRITE32(bm1);
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(OP_GETATTR);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bm0);
+ *p = cpu_to_be32(bm1);
hdr->nops++;
hdr->replen += decode_getattr_maxsz;
}
@@ -1012,8 +989,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_GETFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_GETFH);
hdr->nops++;
hdr->replen += decode_getfh_maxsz;
}
@@ -1022,10 +999,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
{
__be32 *p;
- RESERVE_SPACE(8 + name->len);
- WRITE32(OP_LINK);
- WRITE32(name->len);
- WRITEMEM(name->name, name->len);
+ p = reserve_space(xdr, 8 + name->len);
+ *p++ = cpu_to_be32(OP_LINK);
+ xdr_encode_opaque(p, name->name, name->len);
hdr->nops++;
hdr->replen += decode_link_maxsz;
}
@@ -1052,27 +1028,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
{
__be32 *p;
- RESERVE_SPACE(32);
- WRITE32(OP_LOCK);
- WRITE32(nfs4_lock_type(args->fl, args->block));
- WRITE32(args->reclaim);
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
- WRITE32(args->new_lock_owner);
+ p = reserve_space(xdr, 32);
+ *p++ = cpu_to_be32(OP_LOCK);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+ *p++ = cpu_to_be32(args->reclaim);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ *p = cpu_to_be32(args->new_lock_owner);
if (args->new_lock_owner){
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
- WRITE32(args->open_seqid->sequence->counter);
- WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
- WRITE32(args->lock_seqid->sequence->counter);
- WRITE64(args->lock_owner.clientid);
- WRITE32(16);
- WRITEMEM("lock id:", 8);
- WRITE64(args->lock_owner.id);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
+ *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
+ p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+ p = xdr_encode_hyper(p, args->lock_owner.clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+ xdr_encode_hyper(p, args->lock_owner.id);
}
else {
- RESERVE_SPACE(NFS4_STATEID_SIZE+4);
- WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
- WRITE32(args->lock_seqid->sequence->counter);
+ p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
+ p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(args->lock_seqid->sequence->counter);
}
hdr->nops++;
hdr->replen += decode_lock_maxsz;
@@ -1082,15 +1058,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
{
__be32 *p;
- RESERVE_SPACE(52);
- WRITE32(OP_LOCKT);
- WRITE32(nfs4_lock_type(args->fl, 0));
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
- WRITE64(args->lock_owner.clientid);
- WRITE32(16);
- WRITEMEM("lock id:", 8);
- WRITE64(args->lock_owner.id);
+ p = reserve_space(xdr, 52);
+ *p++ = cpu_to_be32(OP_LOCKT);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ p = xdr_encode_hyper(p, args->lock_owner.clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+ xdr_encode_hyper(p, args->lock_owner.id);
hdr->nops++;
hdr->replen += decode_lockt_maxsz;
}
@@ -1099,13 +1075,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
{
__be32 *p;
- RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
- WRITE32(OP_LOCKU);
- WRITE32(nfs4_lock_type(args->fl, 0));
- WRITE32(args->seqid->sequence->counter);
- WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
+ p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
+ *p++ = cpu_to_be32(OP_LOCKU);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ *p++ = cpu_to_be32(args->seqid->sequence->counter);
+ p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ xdr_encode_hyper(p, nfs4_lock_length(args->fl));
hdr->nops++;
hdr->replen += decode_locku_maxsz;
}
@@ -1115,10 +1091,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
int len = name->len;
__be32 *p;
- RESERVE_SPACE(8 + len);
- WRITE32(OP_LOOKUP);
- WRITE32(len);
- WRITEMEM(name->name, len);
+ p = reserve_space(xdr, 8 + len);
+ *p++ = cpu_to_be32(OP_LOOKUP);
+ xdr_encode_opaque(p, name->name, len);
hdr->nops++;
hdr->replen += decode_lookup_maxsz;
}
@@ -1127,21 +1102,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
{
__be32 *p;
- RESERVE_SPACE(8);
+ p = reserve_space(xdr, 8);
switch (fmode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
- WRITE32(NFS4_SHARE_ACCESS_READ);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
break;
case FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_WRITE);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
break;
case FMODE_READ|FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_BOTH);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
break;
default:
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
- WRITE32(0); /* for linux, share_deny = 0 always */
+ *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
}
static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1151,29 +1126,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
* owner 4 = 32
*/
- RESERVE_SPACE(8);
- WRITE32(OP_OPEN);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_OPEN);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->fmode);
- RESERVE_SPACE(28);
- WRITE64(arg->clientid);
- WRITE32(16);
- WRITEMEM("open id:", 8);
- WRITE64(arg->id);
+ p = reserve_space(xdr, 28);
+ p = xdr_encode_hyper(p, arg->clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "open id:", 8);
+ xdr_encode_hyper(p, arg->id);
}
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch(arg->open_flags & O_EXCL) {
case 0:
- WRITE32(NFS4_CREATE_UNCHECKED);
+ *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
encode_attrs(xdr, arg->u.attrs, arg->server);
break;
default:
- WRITE32(NFS4_CREATE_EXCLUSIVE);
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
encode_nfs4_verifier(xdr, &arg->u.verifier);
}
}
@@ -1182,14 +1157,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch (arg->open_flags & O_CREAT) {
case 0:
- WRITE32(NFS4_OPEN_NOCREATE);
+ *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
break;
default:
BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
- WRITE32(NFS4_OPEN_CREATE);
+ *p = cpu_to_be32(NFS4_OPEN_CREATE);
encode_createmode(xdr, arg);
}
}
@@ -1198,16 +1173,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch (delegation_type) {
case 0:
- WRITE32(NFS4_OPEN_DELEGATE_NONE);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
break;
case FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_READ);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
break;
case FMODE_WRITE|FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
break;
default:
BUG();
@@ -1218,8 +1193,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(NFS4_OPEN_CLAIM_NULL);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
encode_string(xdr, name->len, name->name);
}
@@ -1227,8 +1202,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
encode_delegation_type(xdr, type);
}
@@ -1236,9 +1211,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
- WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+ xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
encode_string(xdr, name->len, name->name);
}
@@ -1267,10 +1242,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
- WRITE32(OP_OPEN_CONFIRM);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
+ p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
hdr->nops++;
hdr->replen += decode_open_confirm_maxsz;
}
@@ -1279,10 +1254,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
- WRITE32(OP_OPEN_DOWNGRADE);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
+ p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->fmode);
hdr->nops++;
hdr->replen += decode_open_downgrade_maxsz;
@@ -1294,10 +1269,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
int len = fh->size;
__be32 *p;
- RESERVE_SPACE(8 + len);
- WRITE32(OP_PUTFH);
- WRITE32(len);
- WRITEMEM(fh->data, len);
+ p = reserve_space(xdr, 8 + len);
+ *p++ = cpu_to_be32(OP_PUTFH);
+ xdr_encode_opaque(p, fh->data, len);
hdr->nops++;
hdr->replen += decode_putfh_maxsz;
}
@@ -1306,8 +1280,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_PUTROOTFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_PUTROOTFH);
hdr->nops++;
hdr->replen += decode_putrootfh_maxsz;
}
@@ -1317,26 +1291,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
nfs4_stateid stateid;
__be32 *p;
- RESERVE_SPACE(NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, NFS4_STATEID_SIZE);
if (ctx->state != NULL) {
nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
- WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
+ xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
} else
- WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+ xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
}
static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_READ);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_READ);
encode_stateid(xdr, args->context);
- RESERVE_SPACE(12);
- WRITE64(args->offset);
- WRITE32(args->count);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
hdr->nops++;
hdr->replen += decode_read_maxsz;
}
@@ -1349,20 +1323,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
};
__be32 *p;
- RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
- WRITE32(OP_READDIR);
- WRITE64(readdir->cookie);
- WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
- WRITE32(readdir->count >> 1); /* We're not doing readdirplus */
- WRITE32(readdir->count);
- WRITE32(2);
+ p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
+ *p++ = cpu_to_be32(OP_READDIR);
+ p = xdr_encode_hyper(p, readdir->cookie);
+ p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */
+ *p++ = cpu_to_be32(readdir->count);
+ *p++ = cpu_to_be32(2);
/* Switch to mounted_on_fileid if the server supports it */
if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
attrs[0] &= ~FATTR4_WORD0_FILEID;
else
attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
- WRITE32(attrs[0] & readdir->bitmask[0]);
- WRITE32(attrs[1] & readdir->bitmask[1]);
+ *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
+ *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
hdr->nops++;
hdr->replen += decode_readdir_maxsz;
dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1378,8 +1352,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_READLINK);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_READLINK);
hdr->nops++;
hdr->replen += decode_readlink_maxsz;
}
@@ -1388,10 +1362,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
{
__be32 *p;
- RESERVE_SPACE(8 + name->len);
- WRITE32(OP_REMOVE);
- WRITE32(name->len);
- WRITEMEM(name->name, name->len);
+ p = reserve_space(xdr, 8 + name->len);
+ *p++ = cpu_to_be32(OP_REMOVE);
+ xdr_encode_opaque(p, name->name, name->len);
hdr->nops++;
hdr->replen += decode_remove_maxsz;
}
@@ -1400,14 +1373,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
{
__be32 *p;
- RESERVE_SPACE(8 + oldname->len);
- WRITE32(OP_RENAME);
- WRITE32(oldname->len);
- WRITEMEM(oldname->name, oldname->len);
-
- RESERVE_SPACE(4 + newname->len);
- WRITE32(newname->len);
- WRITEMEM(newname->name, newname->len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_RENAME);
+ encode_string(xdr, oldname->len, oldname->name);
+ encode_string(xdr, newname->len, newname->name);
hdr->nops++;
hdr->replen += decode_rename_maxsz;
}
@@ -1416,9 +1385,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
{
__be32 *p;
- RESERVE_SPACE(12);
- WRITE32(OP_RENEW);
- WRITE64(client_stateid->cl_clientid);
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(OP_RENEW);
+ xdr_encode_hyper(p, client_stateid->cl_clientid);
hdr->nops++;
hdr->replen += decode_renew_maxsz;
}
@@ -1428,8 +1397,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_RESTOREFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_RESTOREFH);
hdr->nops++;
hdr->replen += decode_restorefh_maxsz;
}
@@ -1439,16 +1408,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(OP_SETATTR);
- WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
- RESERVE_SPACE(2*4);
- WRITE32(1);
- WRITE32(FATTR4_WORD0_ACL);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+ xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 2*4);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(FATTR4_WORD0_ACL);
if (arg->acl_len % 4)
return -EINVAL;
- RESERVE_SPACE(4);
- WRITE32(arg->acl_len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
hdr->nops++;
hdr->replen += decode_setacl_maxsz;
@@ -1460,8 +1429,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_SAVEFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_SAVEFH);
hdr->nops++;
hdr->replen += decode_savefh_maxsz;
}
@@ -1470,9 +1439,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(OP_SETATTR);
- WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+ xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_setattr_maxsz;
encode_attrs(xdr, arg->iap, server);
@@ -1482,17 +1451,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
{
__be32 *p;
- RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
- WRITE32(OP_SETCLIENTID);
- WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+ p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(OP_SETCLIENTID);
+ xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
- RESERVE_SPACE(4);
- WRITE32(setclientid->sc_prog);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_prog);
encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
- RESERVE_SPACE(4);
- WRITE32(setclientid->sc_cb_ident);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_cb_ident);
hdr->nops++;
hdr->replen += decode_setclientid_maxsz;
}
@@ -1501,10 +1470,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
{
__be32 *p;
- RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
- WRITE32(OP_SETCLIENTID_CONFIRM);
- WRITE64(client_state->cl_clientid);
- WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
+ p = xdr_encode_hyper(p, client_state->cl_clientid);
+ xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
hdr->nops++;
hdr->replen += decode_setclientid_confirm_maxsz;
}
@@ -1513,15 +1482,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_WRITE);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_WRITE);
encode_stateid(xdr, args->context);
- RESERVE_SPACE(16);
- WRITE64(args->offset);
- WRITE32(args->stable);
- WRITE32(args->count);
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->offset);
+ *p++ = cpu_to_be32(args->stable);
+ *p = cpu_to_be32(args->count);
xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
hdr->nops++;
@@ -1532,10 +1501,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
- WRITE32(OP_DELEGRETURN);
- WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_DELEGRETURN);
+ xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_delegreturn_maxsz;
}
@@ -1548,16 +1517,16 @@ static void encode_exchange_id(struct xdr_stream *xdr,
{
__be32 *p;
- RESERVE_SPACE(4 + sizeof(args->verifier->data));
- WRITE32(OP_EXCHANGE_ID);
- WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+ p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
+ *p++ = cpu_to_be32(OP_EXCHANGE_ID);
+ xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
encode_string(xdr, args->id_len, args->id);
- RESERVE_SPACE(12);
- WRITE32(args->flags);
- WRITE32(0); /* zero length state_protect4_a */
- WRITE32(0); /* zero length implementation id array */
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(args->flags);
+ *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
+ *p = cpu_to_be32(0); /* zero length implementation id array */
hdr->nops++;
hdr->replen += decode_exchange_id_maxsz;
}
@@ -1571,55 +1540,43 @@ static void encode_create_session(struct xdr_stream *xdr,
uint32_t len;
struct nfs_client *clp = args->client;
- RESERVE_SPACE(4);
- WRITE32(OP_CREATE_SESSION);
-
- RESERVE_SPACE(8);
- WRITE64(clp->cl_ex_clid);
+ len = scnprintf(machine_name, sizeof(machine_name), "%s",
+ clp->cl_ipaddr);
- RESERVE_SPACE(8);
- WRITE32(clp->cl_seqid); /*Sequence id */
- WRITE32(args->flags); /*flags */
+ p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
+ *p++ = cpu_to_be32(OP_CREATE_SESSION);
+ p = xdr_encode_hyper(p, clp->cl_ex_clid);
+ *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+ *p++ = cpu_to_be32(args->flags); /*flags */
- RESERVE_SPACE(2*28); /* 2 channel_attrs */
/* Fore Channel */
- WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
- WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */
- WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */
- WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
- WRITE32(args->fc_attrs.max_ops); /* max operations */
- WRITE32(args->fc_attrs.max_reqs); /* max requests */
- WRITE32(0); /* rdmachannel_attrs */
+ *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
/* Back Channel */
- WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
- WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */
- WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */
- WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
- WRITE32(args->bc_attrs.max_ops); /* max operations */
- WRITE32(args->bc_attrs.max_reqs); /* max requests */
- WRITE32(0); /* rdmachannel_attrs */
-
- RESERVE_SPACE(4);
- WRITE32(args->cb_program); /* cb_program */
-
- RESERVE_SPACE(4); /* # of security flavors */
- WRITE32(1);
-
- RESERVE_SPACE(4);
- WRITE32(RPC_AUTH_UNIX); /* auth_sys */
+ *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ *p++ = cpu_to_be32(args->cb_program); /* cb_program */
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
/* authsys_parms rfc1831 */
- RESERVE_SPACE(4);
- WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
- len = scnprintf(machine_name, sizeof(machine_name), "%s",
- clp->cl_ipaddr);
- RESERVE_SPACE(16 + len);
- WRITE32(len);
- WRITEMEM(machine_name, len);
- WRITE32(0); /* UID */
- WRITE32(0); /* GID */
- WRITE32(0); /* No more gids */
+ *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
+ p = xdr_encode_opaque(p, machine_name, len);
+ *p++ = cpu_to_be32(0); /* UID */
+ *p++ = cpu_to_be32(0); /* GID */
+ *p = cpu_to_be32(0); /* No more gids */
hdr->nops++;
hdr->replen += decode_create_session_maxsz;
}
@@ -1629,9 +1586,9 @@ static void encode_destroy_session(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
- WRITE32(OP_DESTROY_SESSION);
- WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(OP_DESTROY_SESSION);
+ xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
hdr->nops++;
hdr->replen += decode_destroy_session_maxsz;
}
@@ -1655,8 +1612,8 @@ static void encode_sequence(struct xdr_stream *xdr,
WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
slot = tp->slots + args->sa_slotid;
- RESERVE_SPACE(4);
- WRITE32(OP_SEQUENCE);
+ p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
+ *p++ = cpu_to_be32(OP_SEQUENCE);
/*
* Sessionid + seqid + slotid + max slotid + cache_this
@@ -1670,12 +1627,11 @@ static void encode_sequence(struct xdr_stream *xdr,
((u32 *)session->sess_id.data)[3],
slot->seq_nr, args->sa_slotid,
tp->highest_used_slotid, args->sa_cache_this);
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
- WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(slot->seq_nr);
- WRITE32(args->sa_slotid);
- WRITE32(tp->highest_used_slotid);
- WRITE32(args->sa_cache_this);
+ p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(slot->seq_nr);
+ *p++ = cpu_to_be32(args->sa_slotid);
+ *p++ = cpu_to_be32(tp->highest_used_slotid);
+ *p = cpu_to_be32(args->sa_cache_this);
hdr->nops++;
hdr->replen += decode_sequence_maxsz;
#endif /* CONFIG_NFS_V4_1 */
@@ -2466,68 +2422,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
}
#endif /* CONFIG_NFS_V4_1 */
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- * These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define READ32(x) (x) = ntohl(*p++)
-#define READ64(x) do { \
- (x) = (u64)ntohl(*p++) << 32; \
- (x) |= ntohl(*p++); \
-} while (0)
-#define READTIME(x) do { \
- p++; \
- (x.tv_sec) = ntohl(*p++); \
- (x.tv_nsec) = ntohl(*p++); \
-} while (0)
-#define COPYMEM(x,nbytes) do { \
- memcpy((x), p, nbytes); \
- p += XDR_QUADLEN(nbytes); \
-} while (0)
-
-#define READ_BUF(nbytes) do { \
- p = xdr_inline_decode(xdr, nbytes); \
- if (unlikely(!p)) { \
- dprintk("nfs: %s: prematurely hit end of receive" \
- " buffer\n", __func__); \
- dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
- __func__, xdr->p, nbytes, xdr->end); \
- return -EIO; \
- } \
-} while (0)
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+ dprintk("nfs: %s: prematurely hit end of receive buffer. "
+ "Remaining buffer length is %tu words.\n",
+ func, xdr->end - xdr->p);
+}
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
__be32 *p;
- READ_BUF(4);
- READ32(*len);
- READ_BUF(*len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, *len);
+ if (unlikely(!p))
+ goto out_overflow;
*string = (char *)p;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- READ_BUF(8);
- READ32(hdr->status);
- READ32(hdr->taglen);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ hdr->status = be32_to_cpup(p++);
+ hdr->taglen = be32_to_cpup(p);
- READ_BUF(hdr->taglen + 4);
+ p = xdr_inline_decode(xdr, hdr->taglen + 4);
+ if (unlikely(!p))
+ goto out_overflow;
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
- READ32(hdr->nops);
+ hdr->nops = be32_to_cpup(p);
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2536,18 +2477,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
uint32_t opnum;
int32_t nfserr;
- READ_BUF(8);
- READ32(opnum);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ opnum = be32_to_cpup(p++);
if (opnum != expected) {
dprintk("nfs: Server returned operation"
" %d but we issued a request for %d\n",
opnum, expected);
return -EIO;
}
- READ32(nfserr);
+ nfserr = be32_to_cpup(p);
if (nfserr != NFS_OK)
return nfs4_stat_to_errno(nfserr);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
/* Dummy routine */
@@ -2557,8 +2503,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
unsigned int strlen;
char *str;
- READ_BUF(12);
- return decode_opaque_inline(xdr, &strlen, &str);
+ p = xdr_inline_decode(xdr, 12);
+ if (likely(p))
+ return decode_opaque_inline(xdr, &strlen, &str);
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2566,27 +2515,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
uint32_t bmlen;
__be32 *p;
- READ_BUF(4);
- READ32(bmlen);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
bitmap[0] = bitmap[1] = 0;
- READ_BUF((bmlen << 2));
+ p = xdr_inline_decode(xdr, (bmlen << 2));
+ if (unlikely(!p))
+ goto out_overflow;
if (bmlen > 0) {
- READ32(bitmap[0]);
+ bitmap[0] = be32_to_cpup(p++);
if (bmlen > 1)
- READ32(bitmap[1]);
+ bitmap[1] = be32_to_cpup(p);
}
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
{
__be32 *p;
- READ_BUF(4);
- READ32(*attrlen);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *attrlen = be32_to_cpup(p);
*savep = xdr->p;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2609,8 +2570,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
- READ_BUF(4);
- READ32(*type);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *type = be32_to_cpup(p);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
dprintk("%s: bad type %d\n", __func__, *type);
return -EIO;
@@ -2620,6 +2583,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
}
dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2631,14 +2597,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
- READ_BUF(8);
- READ64(*change);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
ret = NFS_ATTR_FATTR_CHANGE;
}
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2650,13 +2621,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
- READ_BUF(8);
- READ64(*size);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2643,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2664,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2701,9 +2687,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
- READ_BUF(16);
- READ64(fsid->major);
- READ64(fsid->minor);
+ p = xdr_inline_decode(xdr, 16);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &fsid->major);
+ xdr_decode_hyper(p, &fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
ret = NFS_ATTR_FATTR_FSID;
}
@@ -2711,6 +2699,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2721,12 +2712,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2737,12 +2733,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2754,13 +2755,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
- READ_BUF(8);
- READ64(*fileid);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2772,13 +2778,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
- READ_BUF(8);
- READ64(*fileid);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2790,12 +2801,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2807,12 +2823,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2824,12 +2845,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2838,8 +2864,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
__be32 *p;
int status = 0;
- READ_BUF(4);
- READ32(n);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ n = be32_to_cpup(p);
if (n == 0)
goto root_path;
dprintk("path ");
@@ -2873,6 +2901,9 @@ out_eio:
dprintk(" status %d", status);
status = -EIO;
goto out;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2890,8 +2921,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = decode_pathname(xdr, &res->fs_path);
if (unlikely(status != 0))
goto out;
- READ_BUF(4);
- READ32(n);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
res->nlocations = 0;
@@ -2899,8 +2932,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
u32 m;
struct nfs4_fs_location *loc = &res->locations[res->nlocations];
- READ_BUF(4);
- READ32(m);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ m = be32_to_cpup(p);
loc->nservers = 0;
dprintk("%s: servers ", __func__);
@@ -2939,6 +2974,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
out_eio:
status = -EIO;
goto out;
@@ -2953,12 +2990,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2970,12 +3012,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
- READ_BUF(4);
- READ32(*maxlink);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *maxlink = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2987,12 +3034,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
- READ_BUF(4);
- READ32(*maxname);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *maxname = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3005,8 +3057,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
uint64_t maxread;
- READ_BUF(8);
- READ64(maxread);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &maxread);
if (maxread > 0x7FFFFFFF)
maxread = 0x7FFFFFFF;
*res = (uint32_t)maxread;
@@ -3014,6 +3068,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
}
dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3026,8 +3083,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
uint64_t maxwrite;
- READ_BUF(8);
- READ64(maxwrite);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &maxwrite);
if (maxwrite > 0x7FFFFFFF)
maxwrite = 0x7FFFFFFF;
*res = (uint32_t)maxwrite;
@@ -3035,6 +3094,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
}
dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3047,14 +3109,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
- READ_BUF(4);
- READ32(tmp);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ tmp = be32_to_cpup(p);
*mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
ret = NFS_ATTR_FATTR_MODE;
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3066,16 +3133,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
- READ_BUF(4);
- READ32(*nlink);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *nlink = be32_to_cpup(p);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_client *clp, uint32_t *uid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3085,10 +3158,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
- if (len < XDR_MAX_NETOBJ) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (!may_sleep) {
+ /* do nothing */
+ } else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
ret = NFS_ATTR_FATTR_OWNER;
else
@@ -3101,9 +3180,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
}
dprintk("%s: uid=%d\n", __func__, (int)*uid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_client *clp, uint32_t *gid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3113,10 +3196,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
- if (len < XDR_MAX_NETOBJ) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (!may_sleep) {
+ /* do nothing */
+ } else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
ret = NFS_ATTR_FATTR_GROUP;
else
@@ -3129,6 +3218,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
}
dprintk("%s: gid=%d\n", __func__, (int)*gid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3143,9 +3235,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
dev_t tmp;
- READ_BUF(8);
- READ32(major);
- READ32(minor);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ major = be32_to_cpup(p++);
+ minor = be32_to_cpup(p);
tmp = MKDEV(major, minor);
if (MAJOR(tmp) == major && MINOR(tmp) == minor)
*rdev = tmp;
@@ -3154,6 +3248,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3165,12 +3262,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3182,12 +3284,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3199,12 +3306,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3216,14 +3328,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
- READ_BUF(8);
- READ64(*used);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
ret = NFS_ATTR_FATTR_SPACE_USED;
}
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3232,12 +3349,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
uint64_t sec;
uint32_t nsec;
- READ_BUF(12);
- READ64(sec);
- READ32(nsec);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &sec);
+ nsec = be32_to_cpup(p);
time->tv_sec = (time_t)sec;
time->tv_nsec = (long)nsec;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3315,11 +3437,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
{
__be32 *p;
- READ_BUF(20);
- READ32(cinfo->atomic);
- READ64(cinfo->before);
- READ64(cinfo->after);
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ goto out_overflow;
+ cinfo->atomic = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &cinfo->before);
+ xdr_decode_hyper(p, &cinfo->after);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3331,40 +3458,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
status = decode_op_hdr(xdr, OP_ACCESS);
if (status)
return status;
- READ_BUF(8);
- READ32(supp);
- READ32(acc);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ supp = be32_to_cpup(p++);
+ acc = be32_to_cpup(p);
access->supported = supp;
access->access = acc;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
{
__be32 *p;
+
+ p = xdr_inline_decode(xdr, len);
+ if (likely(p)) {
+ memcpy(buf, p, len);
+ return 0;
+ }
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
int status;
status = decode_op_hdr(xdr, OP_CLOSE);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+ return decode_opaque_fixed(xdr, verifier, 8);
}
static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_COMMIT);
- if (status)
- return status;
- READ_BUF(8);
- COPYMEM(res->verf->verifier, 8);
- return 0;
+ if (!status)
+ status = decode_verifier(xdr, res->verf->verifier);
+ return status;
}
static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3378,10 +3527,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
return status;
if ((status = decode_change_info(xdr, cinfo)))
return status;
- READ_BUF(4);
- READ32(bmlen);
- READ_BUF(bmlen << 2);
- return 0;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (likely(p))
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3466,7 +3621,8 @@ xdr_error:
return status;
}
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ const struct nfs_server *server, int may_sleep)
{
__be32 *savep;
uint32_t attrlen,
@@ -3538,12 +3694,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+ status = decode_attr_owner(xdr, bitmap, server->nfs_client,
+ &fattr->uid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+ status = decode_attr_group(xdr, bitmap, server->nfs_client,
+ &fattr->gid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -3633,14 +3791,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
if (status)
return status;
- READ_BUF(4);
- READ32(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
fh->size = len;
- READ_BUF(len);
- COPYMEM(fh->data, len);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ memcpy(fh->data, p, len);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3662,10 +3827,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
__be32 *p;
uint32_t namelen, type;
- READ_BUF(32);
- READ64(offset);
- READ64(length);
- READ32(type);
+ p = xdr_inline_decode(xdr, 32);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &offset);
+ p = xdr_decode_hyper(p, &length);
+ type = be32_to_cpup(p++);
if (fl != NULL) {
fl->fl_start = (loff_t)offset;
fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3676,23 +3843,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
fl->fl_type = F_RDLCK;
fl->fl_pid = 0;
}
- READ64(clientid);
- READ32(namelen);
- READ_BUF(namelen);
- return -NFS4ERR_DENIED;
+ p = xdr_decode_hyper(p, &clientid);
+ namelen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, namelen);
+ if (likely(p))
+ return -NFS4ERR_DENIED;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCK);
if (status == -EIO)
goto out;
if (status == 0) {
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+ status = decode_stateid(xdr, &res->stateid);
+ if (unlikely(status))
+ goto out;
} else if (status == -NFS4ERR_DENIED)
status = decode_lock_denied(xdr, NULL);
if (res->open_seqid != NULL)
@@ -3713,16 +3884,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCKU);
if (status != -EIO)
nfs_increment_lock_seqid(status, res->seqid);
- if (status == 0) {
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- }
+ if (status == 0)
+ status = decode_stateid(xdr, &res->stateid);
return status;
}
@@ -3737,34 +3905,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
__be32 *p;
uint32_t limit_type, nblocks, blocksize;
- READ_BUF(12);
- READ32(limit_type);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ limit_type = be32_to_cpup(p++);
switch (limit_type) {
case 1:
- READ64(*maxsize);
+ xdr_decode_hyper(p, maxsize);
break;
case 2:
- READ32(nblocks);
- READ32(blocksize);
+ nblocks = be32_to_cpup(p++);
+ blocksize = be32_to_cpup(p);
*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
{
__be32 *p;
uint32_t delegation_type;
+ int status;
- READ_BUF(4);
- READ32(delegation_type);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ delegation_type = be32_to_cpup(p);
if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
res->delegation_type = 0;
return 0;
}
- READ_BUF(NFS4_STATEID_SIZE+4);
- COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
- READ32(res->do_recall);
+ status = decode_stateid(xdr, &res->delegation);
+ if (unlikely(status))
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->do_recall = be32_to_cpup(p);
switch (delegation_type) {
case NFS4_OPEN_DELEGATE_READ:
@@ -3776,6 +3956,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3787,23 +3970,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
status = decode_op_hdr(xdr, OP_OPEN);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ if (unlikely(status))
return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
decode_change_info(xdr, &res->cinfo);
- READ_BUF(8);
- READ32(res->rflags);
- READ32(bmlen);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->rflags = be32_to_cpup(p++);
+ bmlen = be32_to_cpup(p);
if (bmlen > 10)
goto xdr_error;
- READ_BUF(bmlen << 2);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (unlikely(!p))
+ goto out_overflow;
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
- READ32(res->attrset[i]);
+ res->attrset[i] = be32_to_cpup(p++);
for (; i < NFS4_BITMAP_SIZE; i++)
res->attrset[i] = 0;
@@ -3811,36 +3998,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
}
static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
}
static int decode_putfh(struct xdr_stream *xdr)
@@ -3863,9 +4047,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
status = decode_op_hdr(xdr, OP_READ);
if (status)
return status;
- READ_BUF(8);
- READ32(eof);
- READ32(count);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ eof = be32_to_cpup(p++);
+ count = be32_to_cpup(p);
hdrlen = (u8 *) p - (u8 *) iov->iov_base;
recvd = req->rq_rcv_buf.len - hdrlen;
if (count > recvd) {
@@ -3878,6 +4064,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
res->eof = eof;
res->count = count;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3892,17 +4081,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
int status;
status = decode_op_hdr(xdr, OP_READDIR);
- if (status)
+ if (!status)
+ status = decode_verifier(xdr, readdir->verifier.data);
+ if (unlikely(status))
return status;
- READ_BUF(8);
- COPYMEM(readdir->verifier.data, 8);
dprintk("%s: verifier = %08x:%08x\n",
__func__,
((u32 *)readdir->verifier.data)[0],
((u32 *)readdir->verifier.data)[1]);
- hdrlen = (char *) p - (char *) iov->iov_base;
+ hdrlen = (char *) xdr->p - (char *) iov->iov_base;
recvd = rcvbuf->len - hdrlen;
if (pglen > recvd)
pglen = recvd;
@@ -3990,8 +4179,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
return status;
/* Convert length of symlink */
- READ_BUF(4);
- READ32(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
if (len >= rcvbuf->page_len || len <= 0) {
dprintk("nfs: server returned giant symlink!\n");
return -ENAMETOOLONG;
@@ -4015,6 +4206,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
kaddr[len+rcvbuf->page_base] = '\0';
kunmap_atomic(kaddr, KM_USER0);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4112,10 +4306,16 @@ static int decode_setattr(struct xdr_stream *xdr)
status = decode_op_hdr(xdr, OP_SETATTR);
if (status)
return status;
- READ_BUF(4);
- READ32(bmlen);
- READ_BUF(bmlen << 2);
- return 0;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (likely(p))
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4124,35 +4324,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
uint32_t opnum;
int32_t nfserr;
- READ_BUF(8);
- READ32(opnum);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ opnum = be32_to_cpup(p++);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
" %d\n", opnum);
return -EIO;
}
- READ32(nfserr);
+ nfserr = be32_to_cpup(p);
if (nfserr == NFS_OK) {
- READ_BUF(8 + NFS4_VERIFIER_SIZE);
- READ64(clp->cl_clientid);
- COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &clp->cl_clientid);
+ memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
uint32_t len;
/* skip netid string */
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
/* skip uaddr string */
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
return -NFSERR_CLID_INUSE;
} else
return nfs4_stat_to_errno(nfserr);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4169,11 +4384,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
if (status)
return status;
- READ_BUF(16);
- READ32(res->count);
- READ32(res->verf->committed);
- COPYMEM(res->verf->verifier, 8);
+ p = xdr_inline_decode(xdr, 16);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->count = be32_to_cpup(p++);
+ res->verf->committed = be32_to_cpup(p++);
+ memcpy(res->verf->verifier, p, 8);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4187,6 +4407,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
{
__be32 *p;
uint32_t dummy;
+ char *dummy_str;
int status;
struct nfs_client *clp = res->client;
@@ -4194,36 +4415,45 @@ static int decode_exchange_id(struct xdr_stream *xdr,
if (status)
return status;
- READ_BUF(8);
- READ64(clp->cl_ex_clid);
- READ_BUF(12);
- READ32(clp->cl_seqid);
- READ32(clp->cl_exchange_flags);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &clp->cl_ex_clid);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ clp->cl_seqid = be32_to_cpup(p++);
+ clp->cl_exchange_flags = be32_to_cpup(p++);
/* We ask for SP4_NONE */
- READ32(dummy);
+ dummy = be32_to_cpup(p);
if (dummy != SP4_NONE)
return -EIO;
/* Throw away minor_id */
- READ_BUF(8);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
/* Throw away Major id */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
/* Throw away server_scope */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
/* Throw away Implementation id array */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4232,22 +4462,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
__be32 *p;
u32 nr_attrs;
- READ_BUF(28);
- READ32(attrs->headerpadsz);
- READ32(attrs->max_rqst_sz);
- READ32(attrs->max_resp_sz);
- READ32(attrs->max_resp_sz_cached);
- READ32(attrs->max_ops);
- READ32(attrs->max_reqs);
- READ32(nr_attrs);
+ p = xdr_inline_decode(xdr, 28);
+ if (unlikely(!p))
+ goto out_overflow;
+ attrs->headerpadsz = be32_to_cpup(p++);
+ attrs->max_rqst_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz_cached = be32_to_cpup(p++);
+ attrs->max_ops = be32_to_cpup(p++);
+ attrs->max_reqs = be32_to_cpup(p++);
+ nr_attrs = be32_to_cpup(p);
if (unlikely(nr_attrs > 1)) {
printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
__func__, nr_attrs);
return -EINVAL;
}
- if (nr_attrs == 1)
- READ_BUF(4); /* skip rdma_attrs */
+ if (nr_attrs == 1) {
+ p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+ if (unlikely(!p))
+ goto out_overflow;
+ }
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+ return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
}
static int decode_create_session(struct xdr_stream *xdr,
@@ -4259,24 +4502,26 @@ static int decode_create_session(struct xdr_stream *xdr,
struct nfs4_session *session = clp->cl_session;
status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
- if (status)
+ if (!status)
+ status = decode_sessionid(xdr, &session->sess_id);
+ if (unlikely(status))
return status;
- /* sessionid */
- READ_BUF(NFS4_MAX_SESSIONID_LEN);
- COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
/* seqid, flags */
- READ_BUF(8);
- READ32(clp->cl_seqid);
- READ32(session->flags);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ clp->cl_seqid = be32_to_cpup(p++);
+ session->flags = be32_to_cpup(p);
/* Channel attributes */
status = decode_chan_attrs(xdr, &session->fc_attrs);
if (!status)
status = decode_chan_attrs(xdr, &session->bc_attrs);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4300,7 +4545,9 @@ static int decode_sequence(struct xdr_stream *xdr,
return 0;
status = decode_op_hdr(xdr, OP_SEQUENCE);
- if (status)
+ if (!status)
+ status = decode_sessionid(xdr, &id);
+ if (unlikely(status))
goto out_err;
/*
@@ -4309,36 +4556,43 @@ static int decode_sequence(struct xdr_stream *xdr,
*/
status = -ESERVERFAULT;
- slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
- READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
- COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
if (memcmp(id.data, res->sr_session->sess_id.data,
NFS4_MAX_SESSIONID_LEN)) {
dprintk("%s Invalid session id\n", __func__);
goto out_err;
}
+
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ goto out_overflow;
+
/* seqid */
- READ32(dummy);
+ slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
+ dummy = be32_to_cpup(p++);
if (dummy != slot->seq_nr) {
dprintk("%s Invalid sequence number\n", __func__);
goto out_err;
}
/* slot id */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
if (dummy != res->sr_slotid) {
dprintk("%s Invalid slot id\n", __func__);
goto out_err;
}
/* highest slot id - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* target highest slot id - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* result flags - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p);
status = 0;
out_err:
res->sr_status = status;
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ status = -EIO;
+ goto out_err;
#else /* CONFIG_NFS_V4_1 */
return 0;
#endif /* CONFIG_NFS_V4_1 */
@@ -4370,7 +4624,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
status = decode_open_downgrade(&xdr, res);
if (status != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4397,7 +4652,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
status = decode_access(&xdr, res);
if (status != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4424,7 +4680,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
goto out;
if ((status = decode_getfh(&xdr, res->fh)) != 0)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server
+ ,!RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4448,7 +4705,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
if ((status = decode_putrootfh(&xdr)) != 0)
goto out;
if ((status = decode_getfh(&xdr, res->fh)) == 0)
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4473,7 +4731,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
goto out;
if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
goto out;
- decode_getfattr(&xdr, &res->dir_attr, res->server);
+ decode_getfattr(&xdr, &res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4503,11 +4762,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
goto out;
/* Current FH is target directory */
- if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->new_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->old_fattr, res->server);
+ decode_getfattr(&xdr, res->old_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4540,11 +4801,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
* Note order: OP_LINK leaves the directory as the current
* filehandle.
*/
- if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4573,11 +4836,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
goto out;
if ((status = decode_getfh(&xdr, res->fh)) != 0)
goto out;
- if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->dir_fattr, res->server);
+ decode_getfattr(&xdr, res->dir_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4609,7 +4874,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
status = decode_putfh(&xdr);
if (status)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4716,7 +4982,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
* an ESTALE error. Shouldn't be a problem,
* though, since fattr->valid will remain unset.
*/
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4748,11 +5015,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
goto out;
if (decode_getfh(&xdr, &res->fh) != 0)
goto out;
- if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->f_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if (decode_restorefh(&xdr) != 0)
goto out;
- decode_getfattr(&xdr, res->dir_attr, res->server);
+ decode_getfattr(&xdr, res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4800,7 +5069,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
status = decode_open(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->f_attr, res->server);
+ decode_getfattr(&xdr, res->f_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4827,7 +5097,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
status = decode_setattr(&xdr);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5001,7 +5272,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
status = decode_write(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
if (!status)
status = res->count;
out:
@@ -5030,7 +5302,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
status = decode_commit(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5194,7 +5467,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
if (status != 0)
goto out;
status = decode_delegreturn(&xdr);
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5222,7 +5496,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
goto out;
xdr_enter_page(&xdr, PAGE_SIZE);
status = decode_getfattr(&xdr, &res->fs_locations->fattr,
- res->fs_locations->server);
+ res->fs_locations->server,
+ !RPC_IS_ASYNC(req->rq_task));
out:
return status;
}
@@ -5406,7 +5681,6 @@ static struct {
{ NFS4ERR_SERVERFAULT, -ESERVERFAULT },
{ NFS4ERR_BADTYPE, -EBADTYPE },
{ NFS4ERR_LOCKED, -EAGAIN },
- { NFS4ERR_RESOURCE, -EREMOTEIO },
{ NFS4ERR_SYMLINK, -ELOOP },
{ NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_DEADLOCK, -EDEADLK },
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..90be551b80c1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
Opt_lock, Opt_nolock,
- Opt_v2, Opt_v3,
+ Opt_v2, Opt_v3, Opt_v4,
Opt_udp, Opt_tcp, Opt_rdma,
Opt_acl, Opt_noacl,
Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_nolock, "nolock" },
{ Opt_v2, "v2" },
{ Opt_v3, "v3" },
+ { Opt_v4, "v4" },
{ Opt_udp, "udp" },
{ Opt_tcp, "tcp" },
{ Opt_rdma, "rdma" },
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mountvers, "mountvers=%s" },
{ Opt_nfsvers, "nfsvers=%s" },
{ Opt_nfsvers, "vers=%s" },
- { Opt_minorversion, "minorversion=%u" },
+ { Opt_minorversion, "minorversion=%s" },
{ Opt_sec, "sec=%s" },
{ Opt_proto, "proto=%s" },
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = {
};
#ifdef CONFIG_NFS_V4
+static int nfs4_validate_text_mount_data(void *options,
+ struct nfs_parsed_mount_data *args, const char *dev_name);
+static int nfs4_try_mount(int flags, const char *dev_name,
+ struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -723,6 +728,29 @@ static void nfs_umount_begin(struct super_block *sb)
unlock_kernel();
}
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
+{
+ struct nfs_parsed_mount_data *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data) {
+ data->rsize = NFS_MAX_FILE_IO_SIZE;
+ data->wsize = NFS_MAX_FILE_IO_SIZE;
+ data->acregmin = NFS_DEF_ACREGMIN;
+ data->acregmax = NFS_DEF_ACREGMAX;
+ data->acdirmin = NFS_DEF_ACDIRMIN;
+ data->acdirmax = NFS_DEF_ACDIRMAX;
+ data->mount_server.port = NFS_UNSPEC_PORT;
+ data->nfs_server.port = NFS_UNSPEC_PORT;
+ data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ data->auth_flavors[0] = RPC_AUTH_UNIX;
+ data->auth_flavor_len = 1;
+ data->version = version;
+ data->minorversion = 0;
+ }
+ return data;
+}
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -742,127 +770,21 @@ static int nfs_verify_server_address(struct sockaddr *addr)
}
}
+ dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
return 0;
}
-static void nfs_parse_ipv4_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- u8 *addr = (u8 *)&sin->sin_addr.s_addr;
-
- if (str_len <= INET_ADDRSTRLEN) {
- dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
- (int)str_len, string);
-
- sin->sin_family = AF_INET;
- *addr_len = sizeof(*sin);
- if (in4_pton(string, str_len, addr, '\0', NULL))
- return;
- }
-
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
- const char *delim,
- struct sockaddr_in6 *sin6)
-{
- char *p;
- size_t len;
-
- if ((string + str_len) == delim)
- return 1;
-
- if (*delim != IPV6_SCOPE_DELIMITER)
- return 0;
-
- if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
- return 0;
-
- len = (string + str_len) - delim - 1;
- p = kstrndup(delim + 1, len, GFP_KERNEL);
- if (p) {
- unsigned long scope_id = 0;
- struct net_device *dev;
-
- dev = dev_get_by_name(&init_net, p);
- if (dev != NULL) {
- scope_id = dev->ifindex;
- dev_put(dev);
- } else {
- if (strict_strtoul(p, 10, &scope_id) == 0) {
- kfree(p);
- return 0;
- }
- }
-
- kfree(p);
-
- sin6->sin6_scope_id = scope_id;
- dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
- return 1;
- }
-
- return 0;
-}
-
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
- u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
- const char *delim;
-
- if (str_len <= INET6_ADDRSTRLEN) {
- dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
- (int)str_len, string);
-
- sin6->sin6_family = AF_INET6;
- *addr_len = sizeof(*sin6);
- if (in6_pton(string, str_len, addr,
- IPV6_SCOPE_DELIMITER, &delim) != 0) {
- if (nfs_parse_ipv6_scope_id(string, str_len,
- delim, sin6) != 0)
- return;
- }
- }
-
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-#else
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-#endif
-
/*
- * Construct a sockaddr based on the contents of a string that contains
- * an IP address in presentation format.
- *
- * If there is a problem constructing the new sockaddr, set the address
- * family to AF_UNSPEC.
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
*/
-void nfs_parse_ip_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
+static void nfs_set_port(struct sockaddr *sap, int *port,
+ const unsigned short default_port)
{
- unsigned int i, colons;
-
- colons = 0;
- for (i = 0; i < str_len; i++)
- if (string[i] == ':')
- colons++;
+ if (*port == NFS_UNSPEC_PORT)
+ *port = default_port;
- if (colons >= 2)
- nfs_parse_ipv6_address(string, str_len, sap, addr_len);
- else
- nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+ rpc_set_port(sap, *port);
}
/*
@@ -904,8 +826,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
/*
* Parse the value of the 'sec=' option.
- *
- * The flavor_len setting is for v4 mounts.
*/
static int nfs_parse_security_flavors(char *value,
struct nfs_parsed_mount_data *mnt)
@@ -916,53 +836,43 @@ static int nfs_parse_security_flavors(char *value,
switch (match_token(value, nfs_secflavor_tokens, args)) {
case Opt_sec_none:
- mnt->auth_flavor_len = 0;
mnt->auth_flavors[0] = RPC_AUTH_NULL;
break;
case Opt_sec_sys:
- mnt->auth_flavor_len = 0;
mnt->auth_flavors[0] = RPC_AUTH_UNIX;
break;
case Opt_sec_krb5:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
break;
case Opt_sec_krb5i:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
break;
case Opt_sec_krb5p:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
break;
case Opt_sec_lkey:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
break;
case Opt_sec_lkeyi:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
break;
case Opt_sec_lkeyp:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
break;
case Opt_sec_spkm:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
break;
case Opt_sec_spkmi:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
break;
case Opt_sec_spkmp:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
break;
default:
return 0;
}
+ mnt->auth_flavor_len = 1;
return 1;
}
@@ -1001,7 +911,6 @@ static int nfs_parse_mount_options(char *raw,
while ((p = strsep(&raw, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
unsigned long option;
- int int_option;
int token;
if (!*p)
@@ -1047,10 +956,18 @@ static int nfs_parse_mount_options(char *raw,
break;
case Opt_v2:
mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 2;
break;
case Opt_v3:
mnt->flags |= NFS_MOUNT_VER3;
+ mnt->version = 3;
+ break;
+#ifdef CONFIG_NFS_V4
+ case Opt_v4:
+ mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 4;
break;
+#endif
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1264,20 +1181,33 @@ static int nfs_parse_mount_options(char *raw,
switch (option) {
case NFS2_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 2;
break;
case NFS3_VERSION:
mnt->flags |= NFS_MOUNT_VER3;
+ mnt->version = 3;
+ break;
+#ifdef CONFIG_NFS_V4
+ case NFS4_VERSION:
+ mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 4;
break;
+#endif
default:
goto out_invalid_value;
}
break;
case Opt_minorversion:
- if (match_int(args, &int_option))
- return 0;
- if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
- return 0;
- mnt->minorversion = int_option;
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ rc = strict_strtoul(string, 10, &option);
+ kfree(string);
+ if (rc != 0)
+ goto out_invalid_value;
+ if (option > NFS4_MAX_MINOR_VERSION)
+ goto out_invalid_value;
+ mnt->minorversion = option;
break;
/*
@@ -1323,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw,
default:
dfprintk(MOUNT, "NFS: unrecognized "
"transport protocol\n");
+ kfree(string);
return 0;
}
break;
@@ -1352,11 +1283,14 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_ip_address(string, strlen(string),
- (struct sockaddr *)
- &mnt->nfs_server.address,
- &mnt->nfs_server.addrlen);
+ mnt->nfs_server.addrlen =
+ rpc_pton(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->nfs_server.address,
+ sizeof(mnt->nfs_server.address));
kfree(string);
+ if (mnt->nfs_server.addrlen == 0)
+ goto out_invalid_address;
break;
case Opt_clientaddr:
string = match_strdup(args);
@@ -1376,11 +1310,14 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_ip_address(string, strlen(string),
- (struct sockaddr *)
- &mnt->mount_server.address,
- &mnt->mount_server.addrlen);
+ mnt->mount_server.addrlen =
+ rpc_pton(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->mount_server.address,
+ sizeof(mnt->mount_server.address));
kfree(string);
+ if (mnt->mount_server.addrlen == 0)
+ goto out_invalid_address;
break;
case Opt_lookupcache:
string = match_strdup(args);
@@ -1432,8 +1369,11 @@ static int nfs_parse_mount_options(char *raw,
return 1;
+out_invalid_address:
+ printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
+ return 0;
out_invalid_value:
- printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
+ printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
return 0;
out_nomem:
printk(KERN_INFO "NFS: not enough memory to parse option\n");
@@ -1445,13 +1385,60 @@ out_security_failure:
}
/*
+ * Match the requested auth flavors with the list returned by
+ * the server. Returns zero and sets the mount's authentication
+ * flavor on success; returns -EACCES if server does not support
+ * the requested flavor.
+ */
+static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
+ struct nfs_mount_request *request)
+{
+ unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
+
+ /*
+ * Certain releases of Linux's mountd return an empty
+ * flavor list. To prevent behavioral regression with
+ * these servers (ie. rejecting mounts that used to
+ * succeed), revert to pre-2.6.32 behavior (no checking)
+ * if the returned flavor list is empty.
+ */
+ if (server_authlist_len == 0)
+ return 0;
+
+ /*
+ * We avoid sophisticated negotiating here, as there are
+ * plenty of cases where we can get it wrong, providing
+ * either too little or too much security.
+ *
+ * RFC 2623, section 2.7 suggests we SHOULD prefer the
+ * flavor listed first. However, some servers list
+ * AUTH_NULL first. Our caller plants AUTH_SYS, the
+ * preferred default, in args->auth_flavors[0] if user
+ * didn't specify sec= mount option.
+ */
+ for (i = 0; i < args->auth_flavor_len; i++)
+ for (j = 0; j < server_authlist_len; j++)
+ if (args->auth_flavors[i] == request->auth_flavs[j]) {
+ dfprintk(MOUNT, "NFS: using auth flavor %d\n",
+ request->auth_flavs[j]);
+ args->auth_flavors[0] = request->auth_flavs[j];
+ return 0;
+ }
+
+ dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
+ nfs_umount(request);
+ return -EACCES;
+}
+
+/*
* Use the remote server's MOUNT service to request the NFS file handle
* corresponding to the provided path.
*/
static int nfs_try_mount(struct nfs_parsed_mount_data *args,
struct nfs_fh *root_fh)
{
- unsigned int auth_flavor_len = 0;
+ rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
+ unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
struct nfs_mount_request request = {
.sap = (struct sockaddr *)
&args->mount_server.address,
@@ -1459,15 +1446,19 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
.protocol = args->mount_server.protocol,
.fh = root_fh,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
- .auth_flav_len = &auth_flavor_len,
+ .auth_flav_len = &server_authlist_len,
+ .auth_flavs = server_authlist,
};
int status;
if (args->mount_server.version == 0) {
- if (args->flags & NFS_MOUNT_VER3)
- args->mount_server.version = NFS_MNT3_VERSION;
- else
- args->mount_server.version = NFS_MNT_VERSION;
+ switch (args->version) {
+ default:
+ args->mount_server.version = NFS_MNT3_VERSION;
+ break;
+ case 2:
+ args->mount_server.version = NFS_MNT_VERSION;
+ }
}
request.version = args->mount_server.version;
@@ -1485,23 +1476,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
args->mount_server.addrlen = args->nfs_server.addrlen;
}
request.salen = args->mount_server.addrlen;
-
- /*
- * autobind will be used if mount_server.port == 0
- */
- nfs_set_port(request.sap, args->mount_server.port);
+ nfs_set_port(request.sap, &args->mount_server.port, 0);
/*
* Now ask the mount server to map our export path
* to a file handle.
*/
status = nfs_mount(&request);
- if (status == 0)
- return 0;
+ if (status != 0) {
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+ request.hostname, status);
+ return status;
+ }
- dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
- request.hostname, status);
- return status;
+ /*
+ * MNTv1 (NFSv2) does not support auth flavor negotiation.
+ */
+ if (args->mount_server.version != NFS_MNT3_VERSION)
+ return 0;
+ return nfs_walk_authlist(args, &request);
}
static int nfs_parse_simple_hostname(const char *dev_name,
@@ -1661,22 +1654,11 @@ static int nfs_validate_mount_data(void *options,
const char *dev_name)
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
if (data == NULL)
goto out_no_data;
- args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
- args->rsize = NFS_MAX_FILE_IO_SIZE;
- args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->acregmin = NFS_DEF_ACREGMIN;
- args->acregmax = NFS_DEF_ACREGMAX;
- args->acdirmin = NFS_DEF_ACDIRMIN;
- args->acdirmax = NFS_DEF_ACDIRMAX;
- args->mount_server.port = 0; /* autobind unless user sets port */
- args->nfs_server.port = 0; /* autobind unless user sets port */
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- args->auth_flavors[0] = RPC_AUTH_UNIX;
-
switch (data->version) {
case 1:
data->namlen = 0;
@@ -1697,8 +1679,11 @@ static int nfs_validate_mount_data(void *options,
if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
goto out_invalid_fh;
mntfh->size = data->root.size;
- } else
+ args->version = 3;
+ } else {
mntfh->size = NFS2_FHSIZE;
+ args->version = 2;
+ }
memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1720,11 +1705,9 @@ static int nfs_validate_mount_data(void *options,
args->acdirmin = data->acdirmin;
args->acdirmax = data->acdirmax;
- memcpy(&args->nfs_server.address, &data->addr,
- sizeof(data->addr));
+ memcpy(sap, &data->addr, sizeof(data->addr));
args->nfs_server.addrlen = sizeof(data->addr);
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
if (!(data->flags & NFS_MOUNT_TCP))
@@ -1772,12 +1755,18 @@ static int nfs_validate_mount_data(void *options,
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
- nfs_set_port((struct sockaddr *)&args->nfs_server.address,
- args->nfs_server.port);
+ if (args->version == 4)
+#ifdef CONFIG_NFS_V4
+ return nfs4_validate_text_mount_data(options,
+ args, dev_name);
+#else
+ goto out_v4_not_compiled;
+#endif
+
+ nfs_set_port(sap, &args->nfs_server.port, 0);
nfs_set_mount_transport_protocol(args);
@@ -1800,7 +1789,7 @@ static int nfs_validate_mount_data(void *options,
}
#ifndef CONFIG_NFS_V3
- if (args->flags & NFS_MOUNT_VER3)
+ if (args->version == 3)
goto out_v3_not_compiled;
#endif /* !CONFIG_NFS_V3 */
@@ -1825,6 +1814,12 @@ out_v3_not_compiled:
return -EPROTONOSUPPORT;
#endif /* !CONFIG_NFS_V3 */
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+ dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+ return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
out_nomem:
dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
return -ENOMEM;
@@ -1852,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
data->acdirmin != nfss->acdirmin / HZ ||
data->acdirmax != nfss->acdirmax / HZ ||
data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.port != nfss->port ||
data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
- memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
- data->nfs_server.addrlen) != 0)
+ !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
+ (struct sockaddr *)&nfss->nfs_client->cl_addr))
return -EINVAL;
return 0;
@@ -1897,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->acdirmin = nfss->acdirmin / HZ;
data->acdirmax = nfss->acdirmax / HZ;
data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.port = nfss->port;
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
@@ -1934,6 +1931,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
if (server->flags & NFS_MOUNT_NOAC)
sb->s_flags |= MS_SYNCHRONOUS;
+ sb->s_bdi = &server->backing_dev_info;
+
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
@@ -1950,7 +1949,7 @@ static void nfs_fill_super(struct super_block *sb,
if (data->bsize)
sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
- if (server->flags & NFS_MOUNT_VER3) {
+ if (server->nfs_client->rpc_ops->version == 3) {
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
@@ -1974,7 +1973,7 @@ static void nfs_clone_super(struct super_block *sb,
sb->s_blocksize = old_sb->s_blocksize;
sb->s_maxbytes = old_sb->s_maxbytes;
- if (server->flags & NFS_MOUNT_VER3) {
+ if (server->nfs_client->rpc_ops->version == 3) {
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
@@ -2108,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
};
int error = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = nfs_alloc_parsed_mount_data(3);
mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
if (data == NULL || mntfh == NULL)
goto out_free_fh;
@@ -2120,6 +2119,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (error < 0)
goto out;
+#ifdef CONFIG_NFS_V4
+ if (data->version == 4) {
+ error = nfs4_try_mount(flags, dev_name, data, mnt);
+ kfree(data->client_address);
+ goto out;
+ }
+#endif /* CONFIG_NFS_V4 */
+
/* Get a volume representation */
server = nfs_create_server(data, mntfh);
if (IS_ERR(server)) {
@@ -2150,7 +2157,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs_fill_super(s, data);
- nfs_fscache_get_super_cookie(s, data);
+ nfs_fscache_get_super_cookie(
+ s, data ? data->fscache_uniq : NULL, NULL);
}
mntroot = nfs_get_root(s, mntfh);
@@ -2196,8 +2204,8 @@ static void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
- bdi_unregister(&server->backing_dev_info);
kill_anon_super(s);
+ bdi_unregister(&server->backing_dev_info);
nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
}
@@ -2251,6 +2259,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
if (!s->s_root) {
/* initial superblock/root creation */
nfs_clone_super(s, data->sb);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs_get_root(s, data->fh);
@@ -2317,6 +2326,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
}
+static int nfs4_validate_text_mount_data(void *options,
+ struct nfs_parsed_mount_data *args,
+ const char *dev_name)
+{
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+ nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
+
+ nfs_validate_transport_protocol(args);
+
+ nfs4_validate_mount_flags(args);
+
+ if (args->version != 4) {
+ dfprintk(MOUNT,
+ "NFS4: Illegal mount version\n");
+ return -EINVAL;
+ }
+
+ if (args->auth_flavor_len > 1) {
+ dfprintk(MOUNT,
+ "NFS4: Too many RPC auth flavours specified\n");
+ return -EINVAL;
+ }
+
+ if (args->client_address == NULL) {
+ dfprintk(MOUNT,
+ "NFS4: mount program didn't pass callback address\n");
+ return -EINVAL;
+ }
+
+ return nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ NFS4_MAXNAMLEN,
+ &args->nfs_server.export_path,
+ NFS4_MAXPATHLEN);
+}
+
/*
* Validate NFSv4 mount options
*/
@@ -2324,36 +2370,23 @@ static int nfs4_validate_mount_data(void *options,
struct nfs_parsed_mount_data *args,
const char *dev_name)
{
- struct sockaddr_in *ap;
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
char *c;
if (data == NULL)
goto out_no_data;
- args->rsize = NFS_MAX_FILE_IO_SIZE;
- args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->acregmin = NFS_DEF_ACREGMIN;
- args->acregmax = NFS_DEF_ACREGMAX;
- args->acdirmin = NFS_DEF_ACDIRMIN;
- args->acdirmax = NFS_DEF_ACDIRMAX;
- args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- args->auth_flavor_len = 0;
- args->minorversion = 0;
-
switch (data->version) {
case 1:
- ap = (struct sockaddr_in *)&args->nfs_server.address;
if (data->host_addrlen > sizeof(args->nfs_server.address))
goto out_no_address;
if (data->host_addrlen == 0)
goto out_no_address;
args->nfs_server.addrlen = data->host_addrlen;
- if (copy_from_user(ap, data->host_addr, data->host_addrlen))
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
return -EFAULT;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
if (data->auth_flavourlen) {
@@ -2399,39 +2432,14 @@ static int nfs4_validate_mount_data(void *options,
nfs_validate_transport_protocol(args);
break;
- default: {
- int status;
-
+ default:
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
return -EINVAL;
- nfs_set_port((struct sockaddr *)&args->nfs_server.address,
- args->nfs_server.port);
-
- nfs_validate_transport_protocol(args);
-
- nfs4_validate_mount_flags(args);
-
- if (args->auth_flavor_len > 1)
- goto out_inval_auth;
-
- if (args->client_address == NULL)
- goto out_no_client_address;
-
- status = nfs_parse_devname(dev_name,
- &args->nfs_server.hostname,
- NFS4_MAXNAMLEN,
- &args->nfs_server.export_path,
- NFS4_MAXPATHLEN);
- if (status < 0)
- return status;
-
- break;
- }
+ return nfs4_validate_text_mount_data(options, args, dev_name);
}
return 0;
@@ -2448,10 +2456,6 @@ out_inval_auth:
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
-
-out_no_client_address:
- dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
- return -EINVAL;
}
/*
@@ -2507,7 +2511,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
- nfs_fscache_get_super_cookie(s, data);
+ nfs_fscache_get_super_cookie(
+ s, data ? data->fscache_uniq : NULL, NULL);
}
mntroot = nfs4_get_root(s, mntfh);
@@ -2618,6 +2623,34 @@ out_err:
return ret;
}
+static int nfs4_try_mount(int flags, const char *dev_name,
+ struct nfs_parsed_mount_data *data,
+ struct vfsmount *mnt)
+{
+ char *export_path;
+ struct vfsmount *root_mnt;
+ int error;
+
+ dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+
+ export_path = data->nfs_server.export_path;
+ data->nfs_server.export_path = "/";
+ root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+ data->nfs_server.hostname);
+ data->nfs_server.export_path = export_path;
+
+ error = PTR_ERR(root_mnt);
+ if (IS_ERR(root_mnt))
+ goto out;
+
+ error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+
+out:
+ dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
+ error != 0 ? " [error]" : "");
+ return error;
+}
+
/*
* Get the superblock for an NFS4 mountpoint
*/
@@ -2625,11 +2658,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
struct nfs_parsed_mount_data *data;
- char *export_path;
- struct vfsmount *root_mnt;
int error = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = nfs_alloc_parsed_mount_data(4);
if (data == NULL)
goto out_free_data;
@@ -2638,17 +2669,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
if (error < 0)
goto out;
- export_path = data->nfs_server.export_path;
- data->nfs_server.export_path = "/";
- root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
- data->nfs_server.hostname);
- data->nfs_server.export_path = export_path;
-
- error = PTR_ERR(root_mnt);
- if (IS_ERR(root_mnt))
- goto out;
-
- error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+ error = nfs4_try_mount(flags, dev_name, data, mnt);
out:
kfree(data->client_address);
@@ -2669,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb)
dprintk("--> %s\n", __func__);
nfs_super_return_all_delegations(sb);
kill_anon_super(sb);
- nfs4_renewd_prepare_shutdown(server);
nfs_fscache_release_super_cookie(sb);
nfs_free_server(server);
dprintk("<-- %s\n", __func__);
@@ -2724,6 +2744,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_clone_super(s, data->sb);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs4_get_root(s, data->fh);
@@ -2805,6 +2826,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs4_get_root(s, &mntfh);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a34fae21fe10..590dc03b1dd3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/swap.h>
+#include <linux/migrate.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
#include "internal.h"
#include "iostat.h"
#include "nfs4_fs.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -218,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
-/*
- * Find an associated nfs write request, and prepare to flush it out
- * May return an error if the user signalled nfs_wait_on_request().
- */
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page)
{
struct inode *inode = page->mapping->host;
struct nfs_page *req;
int ret;
spin_lock(&inode->i_lock);
- for(;;) {
+ for (;;) {
req = nfs_page_find_request_locked(page);
- if (req == NULL) {
- spin_unlock(&inode->i_lock);
- return 0;
- }
+ if (req == NULL)
+ break;
if (nfs_set_page_tag_locked(req))
break;
/* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = nfs_wait_on_request(req);
nfs_release_request(req);
if (ret != 0)
- return ret;
+ return ERR_PTR(ret);
spin_lock(&inode->i_lock);
}
- if (test_bit(PG_CLEAN, &req->wb_flags)) {
- spin_unlock(&inode->i_lock);
- BUG();
- }
- if (nfs_set_page_writeback(page) != 0) {
- spin_unlock(&inode->i_lock);
- BUG();
- }
spin_unlock(&inode->i_lock);
+ return req;
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+ struct page *page)
+{
+ struct nfs_page *req;
+ int ret = 0;
+
+ req = nfs_find_and_lock_request(page);
+ if (!req)
+ goto out;
+ ret = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto out;
+
+ ret = nfs_set_page_writeback(page);
+ BUG_ON(ret != 0);
+ BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
+
if (!nfs_pageio_add_request(pgio, req)) {
nfs_redirty_request(req);
- return pgio->pg_error;
+ ret = pgio->pg_error;
}
- return 0;
+out:
+ return ret;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -762,7 +774,7 @@ int nfs_updatepage(struct file *file, struct page *page,
*/
if (nfs_write_pageuptodate(page, inode) &&
inode->i_flock == NULL &&
- !(file->f_flags & O_SYNC)) {
+ !(file->f_flags & O_DSYNC)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
}
@@ -1478,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .for_writepages = 1,
};
return __nfs_write_mapping(mapping, &wbc, how);
@@ -1580,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page)
+{
+ struct nfs_page *req;
+ int ret;
+
+ if (PageFsCache(page))
+ nfs_fscache_release_page(page, GFP_KERNEL);
+
+ req = nfs_find_and_lock_request(page);
+ ret = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto out;
+
+ ret = migrate_page(mapping, newpage, page);
+ if (!req)
+ goto out;
+ if (ret)
+ goto out_unlock;
+ page_cache_get(newpage);
+ req->wb_page = newpage;
+ SetPagePrivate(newpage);
+ set_page_private(newpage, page_private(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+out_unlock:
+ nfs_clear_page_tag_locked(req);
+ nfs_release_request(req);
+out:
+ return ret;
+}
+#endif
+
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
int flags = nfsexp_flags(rqstp, exp);
int ret;
+ validate_process_creds();
+
/* discard any old override before preparing the new set */
revert_creds(get_cred(current->real_cred));
new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
+ validate_process_creds();
put_cred(override_creds(new));
put_cred(new);
+ validate_process_creds();
return 0;
oom:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..b73baba3fb97 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
+}
+
static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
.hash_table = expkey_table,
.name = "nfsd.fh",
.cache_put = expkey_put,
- .cache_request = expkey_request,
+ .cache_upcall = expkey_upcall,
.cache_parse = expkey_parse,
.cache_show = expkey_show,
.match = expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+}
+
static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
.hash_table = export_table,
.name = "nfsd.export",
.cache_put = svc_export_put,
- .cache_request = svc_export_request,
+ .cache_upcall = svc_export_upcall,
.cache_parse = svc_export_parse,
.cache_show = svc_export_show,
.match = svc_export_match,
@@ -1310,6 +1320,23 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
return exp;
}
+static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
+{
+ struct svc_export *exp;
+ u32 fsidv[2];
+
+ mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
+
+ exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
+ /*
+ * We shouldn't have accepting an nfsv4 request at all if we
+ * don't have a pseudoexport!:
+ */
+ if (IS_ERR(exp) && PTR_ERR(exp) == -ENOENT)
+ exp = ERR_PTR(-ESERVERFAULT);
+ return exp;
+}
+
/*
* Called when we need the filehandle for the root of the pseudofs,
* for a given NFSv4 client. The root is defined to be the
@@ -1320,17 +1347,16 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct svc_export *exp;
__be32 rv;
- u32 fsidv[2];
-
- mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
- exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
+ exp = find_fsidzero_export(rqstp);
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
if (rv)
goto out;
rv = check_nfsd_access(exp, rqstp);
+ if (rv)
+ fh_put(fhp);
out:
exp_put(exp);
return rv;
@@ -1505,7 +1531,7 @@ static int e_show(struct seq_file *m, void *p)
return svc_export_show(m, &svc_export_cache, cp);
}
-struct seq_operations nfs_exports_op = {
+const struct seq_operations nfs_exports_op = {
.start = e_start,
.next = e_next,
.stop = e_stop,
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index b2786a5f9afe..812bc64874f6 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -16,6 +16,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/nfsd/nfsd.h>
#include <linux/lockd/bind.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_LOCKD
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3219e84116..38c883d48b02 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -14,6 +14,7 @@
#include <linux/nfsd/xdr3.h>
#include <linux/posix_acl.h>
#include <linux/nfsacl.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
#define RETURN_STATUS(st) { resp->status = (st); return (st); }
@@ -217,6 +218,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
* XDR encode functions
*/
+/*
+ * There must be an encoding function for void results so svc_process
+ * will work properly.
+ */
+int
+nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+ return xdr_ressize_check(rqstp, p);
+}
+
/* GETACL */
static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclres *resp)
@@ -308,7 +319,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
}
#define nfsaclsvc_decode_voidargs NULL
-#define nfsaclsvc_encode_voidres NULL
#define nfsaclsvc_release_void NULL
#define nfsd3_fhandleargs nfsd_fhandle
#define nfsd3_attrstatres nfsd_attrstat
@@ -346,5 +356,5 @@ struct svc_version nfsd_acl_version2 = {
.vs_proc = nfsd_acl_procedures2,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
- .vs_hidden = 1,
+ .vs_hidden = 0,
};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9981dbb377a3..526d85a65f76 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -13,6 +13,7 @@
#include <linux/nfsd/xdr3.h>
#include <linux/posix_acl.h>
#include <linux/nfsacl.h>
+#include "vfs.h"
#define RETURN_STATUS(st) { resp->status = (st); return (st); }
@@ -264,6 +265,6 @@ struct svc_version nfsd_acl_version3 = {
.vs_proc = nfsd_acl_procedures3,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
- .vs_hidden = 1,
+ .vs_hidden = 0,
};
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a713c418a922..1a259d313e14 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -25,6 +25,7 @@
#include <linux/nfsd/cache.h>
#include <linux/nfsd/xdr3.h>
#include <linux/nfs3.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 01d4ec1c88e0..d0a2ce1b4324 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
return p;
}
-static __be32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
- struct svc_fh *fhp)
-{
- p = encode_post_op_attr(cd->rqstp, p, fhp);
- *p++ = xdr_one; /* yes, a file handle follows */
- p = encode_fh(p, fhp);
- fh_put(fhp);
- return p;
-}
-
static int
compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
const char *name, int namlen)
@@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
dparent = cd->fh.fh_dentry;
exp = cd->fh.fh_export;
- fh_init(fhp, NFS3_FHSIZE);
if (isdotent(name, namlen)) {
if (namlen == 2) {
dchild = dget_parent(dparent);
if (dchild == dparent) {
/* filesystem root - cannot return filehandle for ".." */
dput(dchild);
- return 1;
+ return -ENOENT;
}
} else
dchild = dget(dparent);
} else
dchild = lookup_one_len(name, dparent, namlen);
if (IS_ERR(dchild))
- return 1;
- if (d_mountpoint(dchild) ||
- fh_compose(fhp, exp, dchild, &cd->fh) != 0 ||
- !dchild->d_inode)
- rv = 1;
+ return -ENOENT;
+ rv = -ENOENT;
+ if (d_mountpoint(dchild))
+ goto out;
+ rv = fh_compose(fhp, exp, dchild, &cd->fh);
+ if (rv)
+ goto out;
+ if (!dchild->d_inode)
+ goto out;
+ rv = 0;
+out:
dput(dchild);
return rv;
}
+__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+{
+ struct svc_fh fh;
+ int err;
+
+ fh_init(&fh, NFS3_FHSIZE);
+ err = compose_entry_fh(cd, &fh, name, namlen);
+ if (err) {
+ *p++ = 0;
+ *p++ = 0;
+ goto out;
+ }
+ p = encode_post_op_attr(cd->rqstp, p, &fh);
+ *p++ = xdr_one; /* yes, a file handle follows */
+ p = encode_fh(p, &fh);
+out:
+ fh_put(&fh);
+ return p;
+}
+
/*
* Encode a directory entry. This one works for both normal readdir
* and readdirplus.
@@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p = encode_entry_baggage(cd, p, name, namlen, ino);
- /* throw in readdirplus baggage */
- if (plus) {
- struct svc_fh fh;
-
- if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
- *p++ = 0;
- *p++ = 0;
- } else
- p = encode_entryplus_baggage(cd, p, &fh);
- }
+ if (plus)
+ p = encode_entryplus_baggage(cd, p, name, namlen);
num_entry_words = p - cd->buffer;
} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
/* temporarily encode entry into next page, then move back to
@@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
- /* throw in readdirplus baggage */
- if (plus) {
- struct svc_fh fh;
-
- if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
- /* zero out the filehandle */
- *p1++ = 0;
- *p1++ = 0;
- } else
- p1 = encode_entryplus_baggage(cd, p1, &fh);
- }
+ if (plus)
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 54b8b4140c8f..6d9c6aabc85e 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
deny = ~pas.group & pas.other;
if (deny) {
ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
- ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+ ace->flag = eflag;
ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_GROUP;
ace++;
@@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
if (deny) {
ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
- ace->access_mask = mask_from_posix(deny, flags);
+ ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
ace->who = pa->e_id;
ace++;
@@ -389,7 +389,7 @@ sort_pacl(struct posix_acl *pacl)
sort_pacl_range(pacl, 1, i-1);
BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
- j = i++;
+ j = ++i;
while (pacl->a_entries[j].e_tag == ACL_GROUP)
j++;
sort_pacl_range(pacl, i, j-1);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..24e8d78f8dde 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,25 +43,30 @@
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/state.h>
#include <linux/sunrpc/sched.h>
#include <linux/nfs4.h>
+#include <linux/sunrpc/xprtsock.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
+#define NFS4_STATEID_SIZE 16
/* Index of predefined Linux callback client operations */
enum {
- NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_SEQUENCE,
};
enum nfs_cb_opnum4 {
OP_CB_RECALL = 4,
+ OP_CB_SEQUENCE = 11,
};
#define NFS4_MAXTAGLEN 20
@@ -70,17 +75,29 @@ enum nfs_cb_opnum4 {
#define NFS4_dec_cb_null_sz 0
#define cb_compound_enc_hdr_sz 4
#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz (sessionid_sz + 4 + \
+ 1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
+
#define op_enc_sz 1
#define op_dec_sz 2
#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
1 + enc_stateid_sz + \
enc_nfs4_fh_sz)
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
op_dec_sz)
+struct nfs4_rpc_args {
+ void *args_op;
+ struct nfsd4_cb_sequence args_seq;
+};
+
/*
* Generic encode routines from fs/nfs/nfs4xdr.c
*/
@@ -137,11 +154,13 @@ xdr_error: \
} while (0)
struct nfs4_cb_compound_hdr {
- int status;
- u32 ident;
+ /* args */
+ u32 ident; /* minorversion 0 only */
u32 nops;
__be32 *nops_p;
u32 minorversion;
+ /* res */
+ int status;
u32 taglen;
char *tag;
};
@@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
hdr->nops++;
}
+static void
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ if (hdr->minorversion == 0)
+ return;
+
+ RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
+
+ WRITE32(OP_CB_SEQUENCE);
+ WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ WRITE32(args->cbs_clp->cl_cb_seq_nr);
+ WRITE32(0); /* slotid, always 0 */
+ WRITE32(0); /* highest slotid always 0 */
+ WRITE32(0); /* cachethis always 0 */
+ WRITE32(0); /* FIXME: support referring_call_lists */
+ hdr->nops++;
+}
+
static int
nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
{
@@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
}
static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+ struct nfs4_rpc_args *rpc_args)
{
struct xdr_stream xdr;
+ struct nfs4_delegation *args = rpc_args->args_op;
struct nfs4_cb_compound_hdr hdr = {
.ident = args->dl_ident,
+ .minorversion = rpc_args->args_seq.cbs_minorversion,
};
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_cb_compound_hdr(&xdr, &hdr);
+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
encode_cb_recall(&xdr, args, &hdr);
encode_cb_nops(&hdr);
return 0;
@@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
return 0;
}
+/*
+ * Our current back channel implmentation supports a single backchannel
+ * with a single slot.
+ */
+static int
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+ struct rpc_rqst *rqstp)
+{
+ struct nfs4_sessionid id;
+ int status;
+ u32 dummy;
+ __be32 *p;
+
+ if (res->cbs_minorversion == 0)
+ return 0;
+
+ status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
+ if (status)
+ return status;
+
+ /*
+ * If the server returns different values for sessionID, slotID or
+ * sequence number, the server is looney tunes.
+ */
+ status = -ESERVERFAULT;
+
+ READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+ memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
+ p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+ if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ dprintk("%s Invalid session id\n", __func__);
+ goto out;
+ }
+ READ32(dummy);
+ if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+ dprintk("%s Invalid sequence number\n", __func__);
+ goto out;
+ }
+ READ32(dummy); /* slotid must be 0 */
+ if (dummy != 0) {
+ dprintk("%s Invalid slotid\n", __func__);
+ goto out;
+ }
+ /* FIXME: process highest slotid and target highest slotid */
+ status = 0;
+out:
+ return status;
+}
+
+
static int
nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
{
@@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
}
static int
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
+nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
+ struct nfsd4_cb_sequence *seq)
{
struct xdr_stream xdr;
struct nfs4_cb_compound_hdr hdr;
@@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
status = decode_cb_compound_hdr(&xdr, &hdr);
if (status)
goto out;
+ if (seq) {
+ status = decode_cb_sequence(&xdr, seq, rqstp);
+ if (status)
+ goto out;
+ }
status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
out:
return status;
@@ -377,16 +478,15 @@ static int max_cb_time(void)
int setup_callback_client(struct nfs4_client *clp)
{
- struct sockaddr_in addr;
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_timeout timeparms = {
.to_initval = max_cb_time(),
.to_retries = 0,
};
struct rpc_create_args args = {
- .protocol = IPPROTO_TCP,
- .address = (struct sockaddr *)&addr,
- .addrsize = sizeof(addr),
+ .protocol = XPRT_TRANSPORT_TCP,
+ .address = (struct sockaddr *) &cb->cb_addr,
+ .addrsize = cb->cb_addrlen,
.timeout = &timeparms,
.program = &cb_program,
.prognumber = cb->cb_prog,
@@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp)
if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
return -EINVAL;
-
- /* Initialize address */
- memset(&addr, 0, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_port = htons(cb->cb_port);
- addr.sin_addr.s_addr = htonl(cb->cb_addr);
-
+ if (cb->cb_minorversion) {
+ args.bc_xprt = clp->cl_cb_xprt;
+ args.protocol = XPRT_TRANSPORT_BC_TCP;
+ }
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
@@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
.rpc_call_done = nfsd4_cb_probe_done,
};
-static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
-{
- struct auth_cred acred = {
- .machine_cred = 1
- };
+static struct rpc_cred *callback_cred;
- /*
- * Note in the gss case this doesn't actually have to wait for a
- * gss upcall (or any calls to the client); this just creates a
- * non-uptodate cred which the rpc state machine will fill in with
- * a refresh_upcall later.
- */
- return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
- RPCAUTH_LOOKUP_NEW);
+int set_callback_cred(void)
+{
+ callback_cred = rpc_lookup_machine_cred();
+ if (!callback_cred)
+ return -ENOMEM;
+ return 0;
}
+
void do_probe_callback(struct nfs4_client *clp)
{
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
.rpc_argp = clp,
+ .rpc_cred = callback_cred
};
- struct rpc_cred *cred;
int status;
- cred = lookup_cb_cred(cb);
- if (IS_ERR(cred)) {
- status = PTR_ERR(cred);
- goto out;
- }
- cb->cb_cred = cred;
- msg.rpc_cred = cb->cb_cred;
status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
&nfsd4_cb_probe_ops, (void *)clp);
-out:
if (status) {
warn_no_callback_path(clp, status);
put_nfs4_client(clp);
@@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp)
do_probe_callback(clp);
}
+/*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy. Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ */
+static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
+ struct rpc_task *task)
+{
+ struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+ u32 *ptr = (u32 *)clp->cl_sessionid.data;
+ int status = 0;
+
+ dprintk("%s: %u:%u:%u:%u\n", __func__,
+ ptr[0], ptr[1], ptr[2], ptr[3]);
+
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+ rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+ dprintk("%s slot is busy\n", __func__);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * We'll need the clp during XDR encoding and decoding,
+ * and the sequence during decoding to verify the reply
+ */
+ args->args_seq.cbs_clp = clp;
+ task->tk_msg.rpc_resp = &args->args_seq;
+
+out:
+ dprintk("%s status=%d\n", __func__, status);
+ return status;
+}
+
+/*
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegation *dp = calldata;
+ struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+ u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+ int status = 0;
+
+ args->args_seq.cbs_minorversion = minorversion;
+ if (minorversion) {
+ status = nfsd41_cb_setup_sequence(clp, task);
+ if (status) {
+ if (status != -EAGAIN) {
+ /* terminate rpc task */
+ task->tk_status = status;
+ task->tk_action = NULL;
+ }
+ return;
+ }
+ }
+ rpc_call_start(task);
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegation *dp = calldata;
+ struct nfs4_client *clp = dp->dl_client;
+
+ dprintk("%s: minorversion=%d\n", __func__,
+ clp->cl_cb_conn.cb_minorversion);
+
+ if (clp->cl_cb_conn.cb_minorversion) {
+ /* No need for lock, access serialized in nfsd4_cb_prepare */
+ ++clp->cl_cb_seq_nr;
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_wake_up_next(&clp->cl_cb_waitq);
+ dprintk("%s: freed slot, new seqid=%d\n", __func__,
+ clp->cl_cb_seq_nr);
+
+ /* We're done looking into the sequence information */
+ task->tk_msg.rpc_resp = NULL;
+ }
+}
+
static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfs4_delegation *dp = calldata;
struct nfs4_client *clp = dp->dl_client;
+ nfsd4_cb_done(task, calldata);
+
switch (task->tk_status) {
case -EIO:
/* Network partition? */
@@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
break;
default:
/* success, or error we can't handle */
- return;
+ goto done;
}
if (dp->dl_retries--) {
rpc_delay(task, 2*HZ);
task->tk_status = 0;
rpc_restart_call(task);
+ return;
} else {
atomic_set(&clp->cl_cb_conn.cb_set, 0);
warn_no_callback_path(clp, task->tk_status);
}
+done:
+ kfree(task->tk_msg.rpc_argp);
}
static void nfsd4_cb_recall_release(void *calldata)
@@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata)
}
static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+ .rpc_call_prepare = nfsd4_cb_prepare,
.rpc_call_done = nfsd4_cb_recall_done,
.rpc_release = nfsd4_cb_recall_release,
};
@@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_client;
struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
+ struct nfs4_rpc_args *args;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
- .rpc_argp = dp,
- .rpc_cred = clp->cl_cb_conn.cb_cred
+ .rpc_cred = callback_cred
};
- int status;
+ int status = -ENOMEM;
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ goto out;
+ args->args_op = dp;
+ msg.rpc_argp = args;
dp->dl_retries = 1;
status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
&nfsd4_cb_recall_ops, dp);
+out:
if (status) {
+ kfree(args);
put_nfs4_client(clp);
nfs4_put_delegation(dp);
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
@@ -146,6 +145,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
+idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+ return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
+}
+
+static int
idtoname_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -175,10 +180,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
}
static void
-warn_no_idmapd(struct cache_detail *detail)
+warn_no_idmapd(struct cache_detail *detail, int has_died)
{
printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
- detail->last_close? "died" : "not been started");
+ has_died ? "died" : "not been started");
}
@@ -192,7 +197,7 @@ static struct cache_detail idtoname_cache = {
.hash_table = idtoname_table,
.name = "nfs4.idtoname",
.cache_put = ent_put,
- .cache_request = idtoname_request,
+ .cache_upcall = idtoname_upcall,
.cache_parse = idtoname_parse,
.cache_show = idtoname_show,
.warn_no_listener = warn_no_idmapd,
@@ -325,6 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
+nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+ return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
+}
+
+static int
nametoid_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -363,7 +374,7 @@ static struct cache_detail nametoid_cache = {
.hash_table = nametoid_table,
.name = "nfs4.nametoid",
.cache_put = ent_put,
- .cache_request = nametoid_request,
+ .cache_upcall = nametoid_upcall,
.cache_parse = nametoid_parse,
.cache_show = nametoid_show,
.warn_no_listener = warn_no_idmapd,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7c8801769a3c..60a93cdefef5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -48,6 +48,7 @@
#include <linux/nfsd/xdr4.h>
#include <linux/nfs4_acl.h>
#include <linux/sunrpc/gss_api.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -68,7 +69,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
u32 *bmval, u32 *writable)
{
struct dentry *dentry = cstate->current_fh.fh_dentry;
- struct svc_export *exp = cstate->current_fh.fh_export;
/*
* Check about attributes are supported by the NFSv4 server or not.
@@ -80,17 +80,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_attrnotsupp;
/*
- * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported
+ * Check FATTR4_WORD0_ACL can be supported
* in current environment or not.
*/
if (bmval[0] & FATTR4_WORD0_ACL) {
if (!IS_POSIXACL(dentry->d_inode))
return nfserr_attrnotsupp;
}
- if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
- if (exp->ex_fslocs.locations == NULL)
- return nfserr_attrnotsupp;
- }
/*
* According to spec, read-only attributes return ERR_INVAL.
@@ -123,6 +119,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp,
return status;
}
+static int
+is_create_with_attrs(struct nfsd4_open *open)
+{
+ return open->op_create == NFS4_OPEN_CREATE
+ && (open->op_createmode == NFS4_CREATE_UNCHECKED
+ || open->op_createmode == NFS4_CREATE_GUARDED
+ || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
+}
+
+/*
+ * if error occurs when setting the acl, just clear the acl bit
+ * in the returned attr bitmap.
+ */
+static void
+do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl, u32 *bmval)
+{
+ __be32 status;
+
+ status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
+ if (status)
+ /*
+ * We should probably fail the whole open at this point,
+ * but we've already created the file, so it's too late;
+ * So this seems the least of evils:
+ */
+ bmval[0] &= ~FATTR4_WORD0_ACL;
+}
+
static inline void
fh_dup2(struct svc_fh *dst, struct svc_fh *src)
{
@@ -206,6 +231,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
if (status)
goto out;
+ if (is_create_with_attrs(open) && open->op_acl != NULL)
+ do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
+
set_change_info(&open->op_cinfo, current_fh);
fh_dup2(current_fh, &resfh);
@@ -536,12 +564,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_badtype;
}
- if (!status) {
- fh_unlock(&cstate->current_fh);
- set_change_info(&create->cr_cinfo, &cstate->current_fh);
- fh_dup2(&cstate->current_fh, &resfh);
- }
+ if (status)
+ goto out;
+
+ if (create->cr_acl != NULL)
+ do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
+ create->cr_bmval);
+ fh_unlock(&cstate->current_fh);
+ set_change_info(&create->cr_cinfo, &cstate->current_fh);
+ fh_dup2(&cstate->current_fh, &resfh);
+out:
fh_put(&resfh);
return status;
}
@@ -947,34 +980,6 @@ static struct nfsd4_operation nfsd4_ops[];
static const char *nfsd4_op_name(unsigned opnum);
/*
- * This is a replay of a compound for which no cache entry pages
- * were used. Encode the sequence operation, and if cachethis is FALSE
- * encode the uncache rep error on the next operation.
- */
-static __be32
-nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
- struct nfsd4_compoundres *resp)
-{
- struct nfsd4_op *op;
-
- dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
- resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
-
- /* Encode the replayed sequence operation */
- BUG_ON(resp->opcnt != 1);
- op = &args->ops[resp->opcnt - 1];
- nfsd4_encode_operation(resp, op);
-
- /*return nfserr_retry_uncached_rep in next operation. */
- if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
- op = &args->ops[resp->opcnt++];
- op->status = nfserr_retry_uncached_rep;
- nfsd4_encode_operation(resp, op);
- }
- return op->status;
-}
-
-/*
* Enforce NFSv4.1 COMPOUND ordering rules.
*
* TODO:
@@ -1083,13 +1088,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
BUG_ON(op->status == nfs_ok);
encode_op:
- /* Only from SEQUENCE or CREATE_SESSION */
+ /* Only from SEQUENCE */
if (resp->cstate.status == nfserr_replay_cache) {
dprintk("%s NFS4.1 replay from cache\n", __func__);
- if (nfsd4_not_cached(resp))
- status = nfsd4_enc_uncached_replay(args, resp);
- else
- status = op->status;
+ status = op->status;
goto out;
}
if (op->status == nfserr_replay_me) {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index b5348405046b..c7a6b245c7ad 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -47,6 +47,7 @@
#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/mount.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..850960e5d626 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,8 @@
#include <linux/lockd/bind.h>
#include <linux/module.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/clnt.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -413,47 +415,77 @@ gen_sessionid(struct nfsd4_session *ses)
}
/*
- * Give the client the number of slots it requests bound by
- * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ * The protocol defines ca_maxresponssize_cached to include the size of
+ * the rpc header, but all we need to cache is the data starting after
+ * the end of the initial SEQUENCE operation--the rest we regenerate
+ * each time. Therefore we can advertise a ca_maxresponssize_cached
+ * value that is the number of bytes in our cache plus a few additional
+ * bytes. In order to stay on the safe side, and not promise more than
+ * we can cache, those additional bytes must be the minimum possible: 24
+ * bytes of rpc header (xid through accept state, with AUTH_NULL
+ * verifier), 12 for the compound header (with zero-length tag), and 44
+ * for the SEQUENCE op response:
+ */
+#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
+
+/*
+ * Give the client the number of ca_maxresponsesize_cached slots it
+ * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
+ * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
+ * than NFSD_MAX_SLOTS_PER_SESSION.
*
- * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
- * should (up to a point) re-negotiate active sessions and reduce their
- * slot usage to make rooom for new connections. For now we just fail the
- * create session.
+ * If we run out of reserved DRC memory we should (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * rooom for new connections. For now we just fail the create session.
*/
-static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
{
- int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+ int mem, size = fchan->maxresp_cached;
if (fchan->maxreqs < 1)
return nfserr_inval;
- else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
- fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
- spin_lock(&nfsd_serv->sv_lock);
- if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
- np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
- nfsd_serv->sv_drc_pages_used += np;
- spin_unlock(&nfsd_serv->sv_lock);
+ if (size < NFSD_MIN_HDR_SEQ_SZ)
+ size = NFSD_MIN_HDR_SEQ_SZ;
+ size -= NFSD_MIN_HDR_SEQ_SZ;
+ if (size > NFSD_SLOT_CACHE_SIZE)
+ size = NFSD_SLOT_CACHE_SIZE;
+
+ /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
+ mem = fchan->maxreqs * size;
+ if (mem > NFSD_MAX_MEM_PER_SESSION) {
+ fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
+ if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+ fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+ mem = fchan->maxreqs * size;
+ }
- if (np <= 0) {
- status = nfserr_resource;
- fchan->maxreqs = 0;
- } else
- fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+ spin_lock(&nfsd_drc_lock);
+ /* bound the total session drc memory ussage */
+ if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
+ fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
+ mem = fchan->maxreqs * size;
+ }
+ nfsd_drc_mem_used += mem;
+ spin_unlock(&nfsd_drc_lock);
- return status;
+ if (fchan->maxreqs == 0)
+ return nfserr_serverfault;
+
+ fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
+ return 0;
}
/*
* fchan holds the client values on input, and the server values on output
+ * sv_max_mesg is the maximum payload plus one page for overhead.
*/
static int init_forechannel_attrs(struct svc_rqst *rqstp,
struct nfsd4_channel_attrs *session_fchan,
struct nfsd4_channel_attrs *fchan)
{
int status = 0;
- __u32 maxcount = svc_max_payload(rqstp);
+ __u32 maxcount = nfsd_serv->sv_max_mesg;
/* headerpadsz set to zero in encode routine */
@@ -466,36 +498,50 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
fchan->maxresp_sz = maxcount;
session_fchan->maxresp_sz = fchan->maxresp_sz;
- /* Set the max response cached size our default which is
- * a multiple of PAGE_SIZE and small */
- session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
- fchan->maxresp_cached = session_fchan->maxresp_cached;
-
/* Use the client's maxops if possible */
if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
session_fchan->maxops = fchan->maxops;
- /* try to use the client requested number of slots */
- if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
- fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-
/* FIXME: Error means no more DRC pages so the server should
* recover pages from existing sessions. For now fail session
* creation.
*/
- status = set_forechannel_maxreqs(fchan);
+ status = set_forechannel_drc_size(fchan);
+ session_fchan->maxresp_cached = fchan->maxresp_cached;
session_fchan->maxreqs = fchan->maxreqs;
+
+ dprintk("%s status %d\n", __func__, status);
return status;
}
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+ int i;
+
+ for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+ kfree(ses->se_slots[i]);
+}
+
+/*
+ * We don't actually need to cache the rpc and session headers, so we
+ * can allocate a little less for each slot:
+ */
+static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+{
+ return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+}
+
static int
alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
struct nfsd4_create_session *cses)
{
struct nfsd4_session *new, tmp;
- int idx, status = nfserr_resource, slotsize;
+ struct nfsd4_slot *sp;
+ int idx, slotsize, cachesize, i;
+ int status;
memset(&tmp, 0, sizeof(tmp));
@@ -506,14 +552,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
if (status)
goto out;
- /* allocate struct nfsd4_session and slot table in one piece */
- slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
+ BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+ + sizeof(struct nfsd4_session) > PAGE_SIZE);
+
+ status = nfserr_serverfault;
+ /* allocate struct nfsd4_session and slot table pointers in one piece */
+ slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
if (!new)
goto out;
memcpy(new, &tmp, sizeof(*new));
+ /* allocate each struct nfsd4_slot and data cache in one piece */
+ cachesize = slot_bytes(&new->se_fchannel);
+ for (i = 0; i < new->se_fchannel.maxreqs; i++) {
+ sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
+ if (!sp)
+ goto out_free;
+ new->se_slots[i] = sp;
+ }
+
new->se_client = clp;
gen_sessionid(new);
idx = hash_sessionid(&new->se_sessionid);
@@ -530,6 +589,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
status = nfs_ok;
out:
return status;
+out_free:
+ free_session_slots(new);
+ kfree(new);
+ goto out;
}
/* caller must hold sessionid_lock */
@@ -572,19 +635,18 @@ release_session(struct nfsd4_session *ses)
nfsd4_put_session(ses);
}
-static void nfsd4_release_respages(struct page **respages, short resused);
-
void
free_session(struct kref *kref)
{
struct nfsd4_session *ses;
- int i;
+ int mem;
ses = container_of(kref, struct nfsd4_session, se_ref);
- for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
- struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
- nfsd4_release_respages(e->ce_respages, e->ce_resused);
- }
+ spin_lock(&nfsd_drc_lock);
+ mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+ nfsd_drc_mem_used -= mem;
+ spin_unlock(&nfsd_drc_lock);
+ free_session_slots(ses);
kfree(ses);
}
@@ -647,18 +709,14 @@ shutdown_callback_client(struct nfs4_client *clp)
clp->cl_cb_conn.cb_client = NULL;
rpc_shutdown_client(clnt);
}
- if (clp->cl_cb_conn.cb_cred) {
- put_rpccred(clp->cl_cb_conn.cb_cred);
- clp->cl_cb_conn.cb_cred = NULL;
- }
}
static inline void
free_client(struct nfs4_client *clp)
{
shutdown_callback_client(clp);
- nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
- clp->cl_slot.sl_cache_entry.ce_resused);
+ if (clp->cl_cb_xprt)
+ svc_xprt_put(clp->cl_cb_xprt);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
kfree(clp->cl_principal);
@@ -714,25 +772,6 @@ expire_client(struct nfs4_client *clp)
put_nfs4_client(clp);
}
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
-{
- struct nfs4_client *clp;
-
- clp = alloc_client(name);
- if (clp == NULL)
- return NULL;
- memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
- atomic_set(&clp->cl_count, 1);
- atomic_set(&clp->cl_cb_conn.cb_set, 0);
- INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_strhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
- INIT_LIST_HEAD(&clp->cl_sessions);
- INIT_LIST_HEAD(&clp->cl_lru);
- return clp;
-}
-
static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
{
memcpy(target->cl_verifier.data, source->data,
@@ -795,6 +834,46 @@ static void gen_confirm(struct nfs4_client *clp)
*p++ = i++;
}
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+ struct svc_rqst *rqstp, nfs4_verifier *verf)
+{
+ struct nfs4_client *clp;
+ struct sockaddr *sa = svc_addr(rqstp);
+ char *princ;
+
+ clp = alloc_client(name);
+ if (clp == NULL)
+ return NULL;
+
+ princ = svc_gss_principal(rqstp);
+ if (princ) {
+ clp->cl_principal = kstrdup(princ, GFP_KERNEL);
+ if (clp->cl_principal == NULL) {
+ free_client(clp);
+ return NULL;
+ }
+ }
+
+ memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
+ atomic_set(&clp->cl_count, 1);
+ atomic_set(&clp->cl_cb_conn.cb_set, 0);
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
+ copy_verf(clp, verf);
+ rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
+ clp->cl_flavor = rqstp->rq_flavor;
+ copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+ gen_confirm(clp);
+
+ return clp;
+}
+
static int check_name(struct xdr_netobj name)
{
if (name.len == 0)
@@ -902,93 +981,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
return NULL;
}
-/* a helper function for parse_callback */
-static int
-parse_octet(unsigned int *lenp, char **addrp)
-{
- unsigned int len = *lenp;
- char *p = *addrp;
- int n = -1;
- char c;
-
- for (;;) {
- if (!len)
- break;
- len--;
- c = *p++;
- if (c == '.')
- break;
- if ((c < '0') || (c > '9')) {
- n = -1;
- break;
- }
- if (n < 0)
- n = 0;
- n = (n * 10) + (c - '0');
- if (n > 255) {
- n = -1;
- break;
- }
- }
- *lenp = len;
- *addrp = p;
- return n;
-}
-
-/* parse and set the setclientid ipv4 callback address */
-static int
-parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
-{
- int temp = 0;
- u32 cbaddr = 0;
- u16 cbport = 0;
- u32 addrlen = addr_len;
- char *addr = addr_val;
- int i, shift;
-
- /* ipaddress */
- shift = 24;
- for(i = 4; i > 0 ; i--) {
- if ((temp = parse_octet(&addrlen, &addr)) < 0) {
- return 0;
- }
- cbaddr |= (temp << shift);
- if (shift > 0)
- shift -= 8;
- }
- *cbaddrp = cbaddr;
-
- /* port */
- shift = 8;
- for(i = 2; i > 0 ; i--) {
- if ((temp = parse_octet(&addrlen, &addr)) < 0) {
- return 0;
- }
- cbport |= (temp << shift);
- if (shift > 0)
- shift -= 8;
- }
- *cbportp = cbport;
- return 1;
-}
-
static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
{
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
-
- /* Currently, we only support tcp for the callback channel */
- if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
+ unsigned short expected_family;
+
+ /* Currently, we only support tcp and tcp6 for the callback channel */
+ if (se->se_callback_netid_len == 3 &&
+ !memcmp(se->se_callback_netid_val, "tcp", 3))
+ expected_family = AF_INET;
+ else if (se->se_callback_netid_len == 4 &&
+ !memcmp(se->se_callback_netid_val, "tcp6", 4))
+ expected_family = AF_INET6;
+ else
goto out_err;
- if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
- &cb->cb_addr, &cb->cb_port)))
+ cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+ se->se_callback_addr_len,
+ (struct sockaddr *) &cb->cb_addr,
+ sizeof(cb->cb_addr));
+
+ if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
goto out_err;
+
+ if (cb->cb_addr.ss_family == AF_INET6)
+ ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+
cb->cb_minorversion = 0;
cb->cb_prog = se->se_callback_prog;
cb->cb_ident = se->se_callback_ident;
return;
out_err:
+ cb->cb_addr.ss_family = AF_UNSPEC;
+ cb->cb_addrlen = 0;
dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
"will not receive delegations\n",
clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -996,175 +1022,87 @@ out_err:
return;
}
-void
-nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
-
- resp->cstate.statp = statp;
-}
-
/*
- * Dereference the result pages.
+ * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
*/
-static void
-nfsd4_release_respages(struct page **respages, short resused)
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
- int i;
+ struct nfsd4_slot *slot = resp->cstate.slot;
+ unsigned int base;
- dprintk("--> %s\n", __func__);
- for (i = 0; i < resused; i++) {
- if (!respages[i])
- continue;
- put_page(respages[i]);
- respages[i] = NULL;
- }
-}
+ dprintk("--> %s slot %p\n", __func__, slot);
-static void
-nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
-{
- int i;
+ slot->sl_opcnt = resp->opcnt;
+ slot->sl_status = resp->cstate.status;
- for (i = 0; i < count; i++) {
- topages[i] = frompages[i];
- if (!topages[i])
- continue;
- get_page(topages[i]);
+ if (nfsd4_not_cached(resp)) {
+ slot->sl_datalen = 0;
+ return;
}
+ slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
+ base = (char *)resp->cstate.datap -
+ (char *)resp->xbuf->head[0].iov_base;
+ if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
+ slot->sl_datalen))
+ WARN("%s: sessions DRC could not cache compound\n", __func__);
+ return;
}
/*
- * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
- * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
- * length of the XDR response is less than se_fmaxresp_cached
- * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a
- * of the reply (e.g. readdir).
+ * Encode the replay sequence operation from the slot values.
+ * If cachethis is FALSE encode the uncached rep error on the next
+ * operation which sets resp->p and increments resp->opcnt for
+ * nfs4svc_encode_compoundres.
*
- * Store the base and length of the rq_req.head[0] page
- * of the NFSv4.1 data, just past the rpc header.
*/
-void
-nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+static __be32
+nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
+ struct nfsd4_compoundres *resp)
{
- struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
- struct svc_rqst *rqstp = resp->rqstp;
- struct nfsd4_compoundargs *args = rqstp->rq_argp;
- struct nfsd4_op *op = &args->ops[resp->opcnt];
- struct kvec *resv = &rqstp->rq_res.head[0];
-
- dprintk("--> %s entry %p\n", __func__, entry);
+ struct nfsd4_op *op;
+ struct nfsd4_slot *slot = resp->cstate.slot;
- /* Don't cache a failed OP_SEQUENCE. */
- if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
- return;
+ dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
+ resp->opcnt, resp->cstate.slot->sl_cachethis);
- nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
- entry->ce_opcnt = resp->opcnt;
- entry->ce_status = resp->cstate.status;
+ /* Encode the replayed sequence operation */
+ op = &args->ops[resp->opcnt - 1];
+ nfsd4_encode_operation(resp, op);
- /*
- * Don't need a page to cache just the sequence operation - the slot
- * does this for us!
- */
-
- if (nfsd4_not_cached(resp)) {
- entry->ce_resused = 0;
- entry->ce_rpchdrlen = 0;
- dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
- resp->cstate.slot->sl_cache_entry.ce_cachethis);
- return;
- }
- entry->ce_resused = rqstp->rq_resused;
- if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
- entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
- nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
- entry->ce_resused);
- entry->ce_datav.iov_base = resp->cstate.statp;
- entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]));
- /* Current request rpc header length*/
- entry->ce_rpchdrlen = (char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]);
-}
-
-/*
- * We keep the rpc header, but take the nfs reply from the replycache.
- */
-static int
-nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
- struct nfsd4_cache_entry *entry)
-{
- struct svc_rqst *rqstp = resp->rqstp;
- struct kvec *resv = &resp->rqstp->rq_res.head[0];
- int len;
-
- /* Current request rpc header length*/
- len = (char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]);
- if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
- dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
- entry->ce_datav.iov_len);
- return 0;
+ /* Return nfserr_retry_uncached_rep in next operation. */
+ if (args->opcnt > 1 && slot->sl_cachethis == 0) {
+ op = &args->ops[resp->opcnt++];
+ op->status = nfserr_retry_uncached_rep;
+ nfsd4_encode_operation(resp, op);
}
- /* copy the cached reply nfsd data past the current rpc header */
- memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
- entry->ce_datav.iov_len);
- resv->iov_len = len + entry->ce_datav.iov_len;
- return 1;
+ return op->status;
}
/*
- * Keep the first page of the replay. Copy the NFSv4.1 data from the first
- * cached page. Replace any futher replay pages from the cache.
+ * The sequence operation is not cached because we can use the slot and
+ * session values.
*/
__be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
- struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+ struct nfsd4_slot *slot = resp->cstate.slot;
__be32 status;
- dprintk("--> %s entry %p\n", __func__, entry);
-
- /*
- * If this is just the sequence operation, we did not keep
- * a page in the cache entry because we can just use the
- * slot info stored in struct nfsd4_sequence that was checked
- * against the slot in nfsd4_sequence().
- *
- * This occurs when seq->cachethis is FALSE, or when the client
- * session inactivity timer fires and a solo sequence operation
- * is sent (lease renewal).
- */
- if (seq && nfsd4_not_cached(resp)) {
- seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
- return nfs_ok;
- }
+ dprintk("--> %s slot %p\n", __func__, slot);
- if (!nfsd41_copy_replay_data(resp, entry)) {
- /*
- * Not enough room to use the replay rpc header, send the
- * cached header. Release all the allocated result pages.
- */
- svc_free_res_pages(resp->rqstp);
- nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
- entry->ce_resused);
- } else {
- /* Release all but the first allocated result page */
+ /* Either returns 0 or nfserr_retry_uncached */
+ status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
+ if (status == nfserr_retry_uncached_rep)
+ return status;
- resp->rqstp->rq_resused--;
- svc_free_res_pages(resp->rqstp);
+ /* The sequence operation has been encoded, cstate->datap set. */
+ memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
- nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
- &entry->ce_respages[1],
- entry->ce_resused - 1);
- }
-
- resp->rqstp->rq_resused = entry->ce_resused;
- resp->opcnt = entry->ce_opcnt;
- resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
- status = entry->ce_status;
+ resp->opcnt = slot->sl_opcnt;
+ resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
+ status = slot->sl_status;
return status;
}
@@ -1194,13 +1132,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
int status;
unsigned int strhashval;
char dname[HEXDIR_LEN];
+ char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
- u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+ struct sockaddr *sa = svc_addr(rqstp);
+ rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
- " ip_addr=%u flags %x, spa_how %d\n",
+ "ip_addr=%s flags %x, spa_how %d\n",
__func__, rqstp, exid, exid->clname.len, exid->clname.data,
- ip_addr, exid->flags, exid->spa_how);
+ addr_str, exid->flags, exid->spa_how);
if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
return nfserr_inval;
@@ -1281,28 +1221,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
out_new:
/* Normal case */
- new = create_client(exid->clname, dname);
+ new = create_client(exid->clname, dname, rqstp, &verf);
if (new == NULL) {
- status = nfserr_resource;
+ status = nfserr_serverfault;
goto out;
}
- copy_verf(new, &verf);
- copy_cred(&new->cl_cred, &rqstp->rq_cred);
- new->cl_addr = ip_addr;
gen_clid(new);
- gen_confirm(new);
add_to_unconfirmed(new, strhashval);
out_copy:
exid->clientid.cl_boot = new->cl_clientid.cl_boot;
exid->clientid.cl_id = new->cl_clientid.cl_id;
- new->cl_slot.sl_seqid = 0;
exid->seqid = 1;
nfsd4_set_ex_flags(new, exid);
dprintk("nfsd4_exchange_id seqid %d flags %x\n",
- new->cl_slot.sl_seqid, new->cl_exchange_flags);
+ new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
status = nfs_ok;
out:
@@ -1313,40 +1248,60 @@ error:
}
static int
-check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
{
- dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
- slot->sl_seqid);
+ dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
+ slot_seqid);
/* The slot is in use, and no response has been sent. */
- if (slot->sl_inuse) {
- if (seqid == slot->sl_seqid)
+ if (slot_inuse) {
+ if (seqid == slot_seqid)
return nfserr_jukebox;
else
return nfserr_seq_misordered;
}
/* Normal */
- if (likely(seqid == slot->sl_seqid + 1))
+ if (likely(seqid == slot_seqid + 1))
return nfs_ok;
/* Replay */
- if (seqid == slot->sl_seqid)
+ if (seqid == slot_seqid)
return nfserr_replay_cache;
/* Wraparound */
- if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+ if (seqid == 1 && (slot_seqid + 1) == 0)
return nfs_ok;
/* Misordered replay or misordered new request */
return nfserr_seq_misordered;
}
+/*
+ * Cache the create session result into the create session single DRC
+ * slot cache by saving the xdr structure. sl_seqid has been set.
+ * Do this for solo or embedded create session operations.
+ */
+static void
+nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
+ struct nfsd4_clid_slot *slot, int nfserr)
+{
+ slot->sl_status = nfserr;
+ memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
+}
+
+static __be32
+nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
+ struct nfsd4_clid_slot *slot)
+{
+ memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
+ return slot->sl_status;
+}
+
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_create_session *cr_ses)
{
- u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
- struct nfsd4_slot *slot = NULL;
+ struct nfsd4_clid_slot *cs_slot = NULL;
int status = 0;
nfs4_lock_state();
@@ -1354,40 +1309,38 @@ nfsd4_create_session(struct svc_rqst *rqstp,
conf = find_confirmed_client(&cr_ses->clientid);
if (conf) {
- slot = &conf->cl_slot;
- status = check_slot_seqid(cr_ses->seqid, slot);
+ cs_slot = &conf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
dprintk("Got a create_session replay! seqid= %d\n",
- slot->sl_seqid);
- cstate->slot = slot;
- cstate->status = status;
+ cs_slot->sl_seqid);
/* Return the cached reply status */
- status = nfsd4_replay_cache_entry(resp, NULL);
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
goto out;
- } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+ } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
status = nfserr_seq_misordered;
dprintk("Sequence misordered!\n");
dprintk("Expected seqid= %d but got seqid= %d\n",
- slot->sl_seqid, cr_ses->seqid);
+ cs_slot->sl_seqid, cr_ses->seqid);
goto out;
}
- conf->cl_slot.sl_seqid++;
+ cs_slot->sl_seqid++;
} else if (unconf) {
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
- (ip_addr != unconf->cl_addr)) {
+ !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
status = nfserr_clid_inuse;
goto out;
}
- slot = &unconf->cl_slot;
- status = check_slot_seqid(cr_ses->seqid, slot);
+ cs_slot = &unconf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
/* an unconfirmed replay returns misordered */
status = nfserr_seq_misordered;
- goto out;
+ goto out_cache;
}
- slot->sl_seqid++; /* from 0 to 1 */
+ cs_slot->sl_seqid++; /* from 0 to 1 */
move_to_confirmed(unconf);
/*
@@ -1396,6 +1349,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cr_ses->flags &= ~SESSION4_PERSIST;
cr_ses->flags &= ~SESSION4_RDMA;
+ if (cr_ses->flags & SESSION4_BACK_CHAN) {
+ unconf->cl_cb_xprt = rqstp->rq_xprt;
+ svc_xprt_get(unconf->cl_cb_xprt);
+ rpc_copy_addr(
+ (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
+ sa);
+ unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+ unconf->cl_cb_conn.cb_minorversion =
+ cstate->minorversion;
+ unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+ unconf->cl_cb_seq_nr = 1;
+ nfsd4_probe_callback(unconf);
+ }
conf = unconf;
} else {
status = nfserr_stale_clientid;
@@ -1408,12 +1374,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
- cr_ses->seqid = slot->sl_seqid;
+ cr_ses->seqid = cs_slot->sl_seqid;
- slot->sl_inuse = true;
- cstate->slot = slot;
- /* Ensure a page is used for the cache */
- slot->sl_cache_entry.ce_cachethis = 1;
+out_cache:
+ /* cache solo and embedded create sessions under the state lock */
+ nfsd4_cache_create_session(cr_ses, cs_slot, status);
out:
nfs4_unlock_state();
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1478,18 +1443,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (seq->slotid >= session->se_fchannel.maxreqs)
goto out;
- slot = &session->se_slots[seq->slotid];
+ slot = session->se_slots[seq->slotid];
dprintk("%s: slotid %d\n", __func__, seq->slotid);
- status = check_slot_seqid(seq->seqid, slot);
+ /* We do not negotiate the number of slots yet, so set the
+ * maxslots to the session maxreqs which is used to encode
+ * sr_highest_slotid and the sr_target_slot id to maxslots */
+ seq->maxslots = session->se_fchannel.maxreqs;
+
+ status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
if (status == nfserr_replay_cache) {
cstate->slot = slot;
cstate->session = session;
/* Return the cached reply status and set cstate->status
- * for nfsd4_svc_encode_compoundres processing */
+ * for nfsd4_proc_compound processing */
status = nfsd4_replay_cache_entry(resp, seq);
cstate->status = nfserr_replay_cache;
- goto replay_cache;
+ goto out;
}
if (status)
goto out;
@@ -1497,23 +1467,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
/* Success! bump slot seqid */
slot->sl_inuse = true;
slot->sl_seqid = seq->seqid;
- slot->sl_cache_entry.ce_cachethis = seq->cachethis;
- /* Always set the cache entry cachethis for solo sequence */
- if (nfsd4_is_solo_sequence(resp))
- slot->sl_cache_entry.ce_cachethis = 1;
+ slot->sl_cachethis = seq->cachethis;
cstate->slot = slot;
cstate->session = session;
-replay_cache:
- /* Renew the clientid on success and on replay.
- * Hold a session reference until done processing the compound:
+ /* Hold a session reference until done processing the compound:
* nfsd4_put_session called only if the cstate slot is set.
*/
- renew_client(session->se_client);
nfsd4_get_session(session);
out:
spin_unlock(&sessionid_lock);
+ /* Renew the clientid on success and on replay */
+ if (cstate->session) {
+ nfs4_lock_state();
+ renew_client(session->se_client);
+ nfs4_unlock_state();
+ }
dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
}
@@ -1522,7 +1492,7 @@ __be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
- struct sockaddr_in *sin = svc_addr_in(rqstp);
+ struct sockaddr *sa = svc_addr(rqstp);
struct xdr_netobj clname = {
.len = setclid->se_namelen,
.data = setclid->se_name,
@@ -1531,7 +1501,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
- char *princ;
char dname[HEXDIR_LEN];
if (!check_name(clname))
@@ -1554,8 +1523,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* RFC 3530 14.2.33 CASE 0: */
status = nfserr_clid_inuse;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
- dprintk("NFSD: setclientid: string in use by client"
- " at %pI4\n", &conf->cl_addr);
+ char addr_str[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
+ sizeof(addr_str));
+ dprintk("NFSD: setclientid: string in use by client "
+ "at %s\n", addr_str);
goto out;
}
}
@@ -1573,7 +1545,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
if (unconf)
expire_client(unconf);
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
@@ -1590,7 +1562,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
expire_client(unconf);
}
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
copy_clid(new, conf);
@@ -1600,7 +1572,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* probable client reboot; state will be removed if
* confirmed.
*/
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
@@ -1611,25 +1583,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* confirmed.
*/
expire_client(unconf);
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
}
- copy_verf(new, &clverifier);
- new->cl_addr = sin->sin_addr.s_addr;
- new->cl_flavor = rqstp->rq_flavor;
- princ = svc_gss_principal(rqstp);
- if (princ) {
- new->cl_principal = kstrdup(princ, GFP_KERNEL);
- if (new->cl_principal == NULL) {
- free_client(new);
- goto out;
- }
- }
- copy_cred(&new->cl_cred, &rqstp->rq_cred);
- gen_confirm(new);
- gen_callback(new, setclid);
+ gen_callback(new, setclid, rpc_get_scope_id(sa));
add_to_unconfirmed(new, strhashval);
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1651,7 +1610,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid_confirm *setclientid_confirm)
{
- struct sockaddr_in *sin = svc_addr_in(rqstp);
+ struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1670,9 +1629,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
unconf = find_unconfirmed_client(clid);
status = nfserr_clid_inuse;
- if (conf && conf->cl_addr != sin->sin_addr.s_addr)
+ if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
goto out;
- if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
+ if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
goto out;
/*
@@ -2163,7 +2122,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
return -EAGAIN;
}
-static struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
.fl_break = nfsd_break_deleg_cb,
.fl_release_private = nfsd_release_deleg_cb,
.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -2458,11 +2417,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
- dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n",
- dp->dl_stateid.si_boot,
- dp->dl_stateid.si_stateownerid,
- dp->dl_stateid.si_fileid,
- dp->dl_stateid.si_generation);
+ dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
+ STATEID_VAL(&dp->dl_stateid));
out:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
&& flag == NFS4_OPEN_DELEGATE_NONE
@@ -2552,9 +2508,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs_ok;
- dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n",
- stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid,
- stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
+ dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(&stp->st_stateid));
out:
if (fp)
put_nfs4_file(fp);
@@ -2720,9 +2675,8 @@ STALE_STATEID(stateid_t *stateid)
{
if (time_after((unsigned long)boot_time,
(unsigned long)stateid->si_boot)) {
- dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
- stateid->si_boot, stateid->si_stateownerid,
- stateid->si_fileid, stateid->si_generation);
+ dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
+ STATEID_VAL(stateid));
return 1;
}
return 0;
@@ -2734,9 +2688,8 @@ EXPIRED_STATEID(stateid_t *stateid)
if (time_before((unsigned long)boot_time,
((unsigned long)stateid->si_boot)) &&
time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
- dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n",
- stateid->si_boot, stateid->si_stateownerid,
- stateid->si_fileid, stateid->si_generation);
+ dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
+ STATEID_VAL(stateid));
return 1;
}
return 0;
@@ -2750,9 +2703,8 @@ stateid_error_map(stateid_t *stateid)
if (EXPIRED_STATEID(stateid))
return nfserr_expired;
- dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n",
- stateid->si_boot, stateid->si_stateownerid,
- stateid->si_fileid, stateid->si_generation);
+ dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
+ STATEID_VAL(stateid));
return nfserr_bad_stateid;
}
@@ -2938,10 +2890,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
struct svc_fh *current_fh = &cstate->current_fh;
__be32 status;
- dprintk("NFSD: preprocess_seqid_op: seqid=%d "
- "stateid = (%08x/%08x/%08x/%08x)\n", seqid,
- stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid,
- stateid->si_generation);
+ dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
+ seqid, STATEID_VAL(stateid));
*stpp = NULL;
*sopp = NULL;
@@ -3073,12 +3023,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
sop->so_confirmed = 1;
update_stateid(&stp->st_stateid);
memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
- dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d "
- "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid,
- stp->st_stateid.si_boot,
- stp->st_stateid.si_stateownerid,
- stp->st_stateid.si_fileid,
- stp->st_stateid.si_generation);
+ dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
+ __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
nfsd4_create_clid_dir(sop->so_client);
out:
@@ -3337,9 +3283,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid)
struct nfs4_file *fp;
struct nfs4_delegation *dl;
- dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n",
- stid->si_boot, stid->si_stateownerid,
- stid->si_fileid, stid->si_generation);
+ dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(stid));
fp = find_file(ino);
if (!fp)
@@ -3368,7 +3313,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
/* Hack!: For now, we're defining this just so we can use a pointer to it
* as a unique cookie to identify our (NFSv4's) posix locks. */
-static struct lock_manager_operations nfsd_posix_mng_ops = {
+static const struct lock_manager_operations nfsd_posix_mng_ops = {
};
static inline void
@@ -4072,7 +4017,7 @@ set_max_delegations(void)
/* initialization to perform when the nfsd service is started: */
-static void
+static int
__nfs4_state_start(void)
{
unsigned long grace_time;
@@ -4084,19 +4029,26 @@ __nfs4_state_start(void)
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
+ if (laundry_wq == NULL)
+ return -ENOMEM;
queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
set_max_delegations();
+ return set_callback_cred();
}
-void
+int
nfs4_state_start(void)
{
+ int ret;
+
if (nfs4_init)
- return;
+ return 0;
nfsd4_load_reboot_recovery_data();
- __nfs4_state_start();
+ ret = __nfs4_state_start();
+ if (ret)
+ return ret;
nfs4_init = 1;
- return;
+ return 0;
}
time_t
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..db0fc55670b3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -57,6 +57,7 @@
#include <linux/nfs4_acl.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -1599,7 +1600,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
{
struct svc_fh tmp_fh;
- char *path, *rootpath;
+ char *path = NULL, *rootpath;
+ size_t rootlen;
fh_init(&tmp_fh, NFS4_FHSIZE);
*stat = exp_pseudoroot(rqstp, &tmp_fh);
@@ -1609,14 +1611,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
path = exp->ex_pathname;
- if (strncmp(path, rootpath, strlen(rootpath))) {
+ rootlen = strlen(rootpath);
+ if (strncmp(path, rootpath, rootlen)) {
dprintk("nfsd: fs_locations failed;"
"%s is not contained in %s\n", path, rootpath);
*stat = nfserr_notsupp;
- return NULL;
+ path = NULL;
+ goto out;
}
-
- return path + strlen(rootpath);
+ path += rootlen;
+out:
+ fh_put(&tmp_fh);
+ return path;
}
/*
@@ -1793,11 +1799,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out_nfserr;
}
}
- if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- if (exp->ex_fslocs.locations == NULL) {
- bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
- }
- }
if ((buflen -= 16) < 0)
goto out_resource;
@@ -1825,8 +1826,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out_resource;
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
- if (!exp->ex_fslocs.locations)
- word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
if (!word2) {
WRITE32(2);
WRITE32(word0);
@@ -3064,6 +3063,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
WRITE32(0);
ADJUST_ARGS();
+ resp->cstate.datap = p; /* DRC cache data pointer */
return 0;
}
@@ -3166,7 +3166,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
return status;
session = resp->cstate.session;
- if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+ if (session == NULL || slot->sl_cachethis == 0)
return status;
if (resp->opcnt >= args->opcnt)
@@ -3291,6 +3291,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
/*
* All that remains is to write the tag and operation count...
*/
+ struct nfsd4_compound_state *cs = &resp->cstate;
struct kvec *iov;
p = resp->tagp;
*p++ = htonl(resp->taglen);
@@ -3304,17 +3305,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov = &rqstp->rq_res.head[0];
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
- if (nfsd4_has_session(&resp->cstate)) {
- if (resp->cstate.status == nfserr_replay_cache &&
- !nfsd4_not_cached(resp)) {
- iov->iov_len = resp->cstate.iovlen;
- } else {
- nfsd4_store_cache_entry(resp);
- dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
- resp->cstate.slot->sl_inuse = 0;
- }
- if (resp->cstate.session)
- nfsd4_put_session(resp->cstate.session);
+ if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
+ nfsd4_store_cache_entry(resp);
+ dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+ resp->cstate.slot->sl_inuse = false;
+ nfsd4_put_session(resp->cstate.session);
}
return 1;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
#include <linux/nfsd/xdr.h>
#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
+#include <linux/sunrpc/clnt.h>
#include <asm/uaccess.h>
#include <net/ipv6.h>
@@ -173,12 +174,13 @@ static const struct file_operations exports_operations = {
};
extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
-static struct file_operations pool_stats_operations = {
+static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = nfsd_pool_stats_release,
.owner = THIS_MODULE,
};
@@ -490,22 +492,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
*
* Input:
* buf: '\n'-terminated C string containing a
- * presentation format IPv4 address
+ * presentation format IP address
* size: length of C string in @buf
* Output:
* On success: returns zero if all specified locks were released;
* returns one if one or more locks were not released
* On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in
*/
static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- };
- int b1, b2, b3, b4;
- char c;
+ struct sockaddr_storage address;
+ struct sockaddr *sap = (struct sockaddr *)&address;
+ size_t salen = sizeof(address);
char *fo_path;
/* sanity check */
@@ -519,14 +517,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
- /* get ipv4 address */
- if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
- return -EINVAL;
- if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+ if (rpc_pton(fo_path, size, sap, salen) == 0)
return -EINVAL;
- sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
- return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
+ return nlmsvc_unlock_all_by_ip(sap);
}
/**
@@ -783,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
size -= len;
mesg += len;
}
-
- mutex_unlock(&nfsd_mutex);
- return (mesg-buf);
-
+ rv = mesg - buf;
out_free:
kfree(nthreads);
mutex_unlock(&nfsd_mutex);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8847f3fbfc1e..d0d8a217a3ea 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -22,6 +22,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/nfsd/nfsd.h>
+#include "vfs.h"
#include "auth.h"
#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -397,44 +398,51 @@ static inline void _fh_update_old(struct dentry *dentry,
fh->ofh_dirino = 0;
}
-__be32
-fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
- struct svc_fh *ref_fh)
+static bool is_root_export(struct svc_export *exp)
{
- /* ref_fh is a reference file handle.
- * if it is non-null and for the same filesystem, then we should compose
- * a filehandle which is of the same version, where possible.
- * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
- * Then create a 32byte filehandle using nfs_fhbase_old
- *
- */
+ return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
+}
- u8 version;
- u8 fsid_type = 0;
- struct inode * inode = dentry->d_inode;
- struct dentry *parent = dentry->d_parent;
- __u32 *datap;
- dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev;
- int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root);
+static struct super_block *exp_sb(struct svc_export *exp)
+{
+ return exp->ex_path.dentry->d_inode->i_sb;
+}
- dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
- MAJOR(ex_dev), MINOR(ex_dev),
- (long) exp->ex_path.dentry->d_inode->i_ino,
- parent->d_name.name, dentry->d_name.name,
- (inode ? inode->i_ino : 0));
+static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
+{
+ switch (fsid_type) {
+ case FSID_DEV:
+ if (!old_valid_dev(exp_sb(exp)->s_dev))
+ return 0;
+ /* FALL THROUGH */
+ case FSID_MAJOR_MINOR:
+ case FSID_ENCODE_DEV:
+ return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
+ case FSID_NUM:
+ return exp->ex_flags & NFSEXP_FSID;
+ case FSID_UUID8:
+ case FSID_UUID16:
+ if (!is_root_export(exp))
+ return 0;
+ /* fall through */
+ case FSID_UUID4_INUM:
+ case FSID_UUID16_INUM:
+ return exp->ex_uuid != NULL;
+ }
+ return 1;
+}
- /* Choose filehandle version and fsid type based on
- * the reference filehandle (if it is in the same export)
- * or the export options.
- */
- retry:
+
+static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
+{
+ u8 version;
+ u8 fsid_type;
+retry:
version = 1;
if (ref_fh && ref_fh->fh_export == exp) {
version = ref_fh->fh_handle.fh_version;
fsid_type = ref_fh->fh_handle.fh_fsid_type;
- if (ref_fh == fhp)
- fh_put(ref_fh);
ref_fh = NULL;
switch (version) {
@@ -447,58 +455,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
goto retry;
}
- /* Need to check that this type works for this
- * export point. As the fsid -> filesystem mapping
- * was guided by user-space, there is no guarantee
- * that the filesystem actually supports that fsid
- * type. If it doesn't we loop around again without
- * ref_fh set.
+ /*
+ * As the fsid -> filesystem mapping was guided by
+ * user-space, there is no guarantee that the filesystem
+ * actually supports that fsid type. If it doesn't we
+ * loop around again without ref_fh set.
*/
- switch(fsid_type) {
- case FSID_DEV:
- if (!old_valid_dev(ex_dev))
- goto retry;
- /* FALL THROUGH */
- case FSID_MAJOR_MINOR:
- case FSID_ENCODE_DEV:
- if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
- & FS_REQUIRES_DEV))
- goto retry;
- break;
- case FSID_NUM:
- if (! (exp->ex_flags & NFSEXP_FSID))
- goto retry;
- break;
- case FSID_UUID8:
- case FSID_UUID16:
- if (!root_export)
- goto retry;
- /* fall through */
- case FSID_UUID4_INUM:
- case FSID_UUID16_INUM:
- if (exp->ex_uuid == NULL)
- goto retry;
- break;
- }
+ if (!fsid_type_ok_for_exp(fsid_type, exp))
+ goto retry;
} else if (exp->ex_flags & NFSEXP_FSID) {
fsid_type = FSID_NUM;
} else if (exp->ex_uuid) {
if (fhp->fh_maxsize >= 64) {
- if (root_export)
+ if (is_root_export(exp))
fsid_type = FSID_UUID16;
else
fsid_type = FSID_UUID16_INUM;
} else {
- if (root_export)
+ if (is_root_export(exp))
fsid_type = FSID_UUID8;
else
fsid_type = FSID_UUID4_INUM;
}
- } else if (!old_valid_dev(ex_dev))
+ } else if (!old_valid_dev(exp_sb(exp)->s_dev))
/* for newer device numbers, we must use a newer fsid format */
fsid_type = FSID_ENCODE_DEV;
else
fsid_type = FSID_DEV;
+ fhp->fh_handle.fh_version = version;
+ if (version)
+ fhp->fh_handle.fh_fsid_type = fsid_type;
+}
+
+__be32
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ struct svc_fh *ref_fh)
+{
+ /* ref_fh is a reference file handle.
+ * if it is non-null and for the same filesystem, then we should compose
+ * a filehandle which is of the same version, where possible.
+ * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+ * Then create a 32byte filehandle using nfs_fhbase_old
+ *
+ */
+
+ struct inode * inode = dentry->d_inode;
+ struct dentry *parent = dentry->d_parent;
+ __u32 *datap;
+ dev_t ex_dev = exp_sb(exp)->s_dev;
+
+ dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
+ MAJOR(ex_dev), MINOR(ex_dev),
+ (long) exp->ex_path.dentry->d_inode->i_ino,
+ parent->d_name.name, dentry->d_name.name,
+ (inode ? inode->i_ino : 0));
+
+ /* Choose filehandle version and fsid type based on
+ * the reference filehandle (if it is in the same export)
+ * or the export options.
+ */
+ set_version_and_fsid_type(fhp, exp, ref_fh);
if (ref_fh == fhp)
fh_put(ref_fh);
@@ -516,7 +532,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_export = exp;
cache_get(&exp->h);
- if (version == 0xca) {
+ if (fhp->fh_handle.fh_version == 0xca) {
/* old style filehandle please */
memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;
@@ -530,22 +546,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
_fh_update_old(dentry, exp, &fhp->fh_handle);
} else {
int len;
- fhp->fh_handle.fh_version = 1;
fhp->fh_handle.fh_auth_type = 0;
datap = fhp->fh_handle.fh_auth+0;
- fhp->fh_handle.fh_fsid_type = fsid_type;
- mk_fsid(fsid_type, datap, ex_dev,
+ mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
exp->ex_path.dentry->d_inode->i_ino,
exp->ex_fsid, exp->ex_uuid);
- len = key_len(fsid_type);
+ len = key_len(fhp->fh_handle.fh_fsid_type);
datap += len/4;
fhp->fh_handle.fh_size = 4 + len;
if (inode)
_fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255)
+ if (fhp->fh_handle.fh_fileid_type == 255) {
+ fh_put(fhp);
return nfserr_opnotsupp;
+ }
}
return 0;
@@ -639,8 +655,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
case FSID_DEV:
case FSID_ENCODE_DEV:
case FSID_MAJOR_MINOR:
- if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
- & FS_REQUIRES_DEV)
+ if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
return FSIDSOURCE_DEV;
break;
case FSID_NUM:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0eb9c820b7a6..6c967e1ba37b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -24,6 +24,7 @@
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/cache.h>
#include <linux/nfsd/xdr.h>
+#include "vfs.h"
typedef struct svc_rqst svc_rqst;
typedef struct svc_buf svc_buf;
@@ -758,6 +759,7 @@ nfserrno (int errno)
{ nfserr_io, -ETXTBSY },
{ nfserr_notsupp, -EOPNOTSUPP },
{ nfserr_toosmall, -ETOOSMALL },
+ { nfserr_serverfault, -ESERVERFAULT },
};
int i;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..2944b31dcbe6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,8 @@
#include <linux/nfsd/syscall.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
+#include <linux/seq_file.h>
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_SVC
@@ -66,6 +68,16 @@ struct timeval nfssvc_boot;
DEFINE_MUTEX(nfsd_mutex);
struct svc_serv *nfsd_serv;
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
+ * nfsd_drc_max_pages limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t nfsd_drc_lock;
+unsigned int nfsd_drc_max_mem;
+unsigned int nfsd_drc_mem_used;
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
static struct svc_version * nfsd_acl_version[] = {
@@ -235,13 +247,12 @@ void nfsd_reset_versions(void)
*/
static void set_max_drc(void)
{
- /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
- #define NFSD_DRC_SIZE_SHIFT 7
- nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
- >> NFSD_DRC_SIZE_SHIFT;
- nfsd_serv->sv_drc_pages_used = 0;
- dprintk("%s svc_drc_max_pages %u\n", __func__,
- nfsd_serv->sv_drc_max_pages);
+ #define NFSD_DRC_SIZE_SHIFT 10
+ nfsd_drc_max_mem = (nr_free_buffer_pages()
+ >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+ nfsd_drc_mem_used = 0;
+ spin_lock_init(&nfsd_drc_lock);
+ dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
}
int nfsd_create_serv(void)
@@ -401,7 +412,9 @@ nfsd_svc(unsigned short port, int nrservs)
error = nfsd_racache_init(2*nrservs);
if (error<0)
goto out;
- nfs4_state_start();
+ error = nfs4_state_start();
+ if (error)
+ goto out;
nfsd_reset_versions();
@@ -496,7 +509,9 @@ nfsd(void *vrqstp)
/* Lock the export hash tables for reading. */
exp_readlock();
+ validate_process_creds();
svc_process(rqstp);
+ validate_process_creds();
/* Unlock export hash tables */
exp_readunlock();
@@ -567,10 +582,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ rqstp->rq_res.head[0].iov_len;
rqstp->rq_res.head[0].iov_len += sizeof(__be32);
- /* NFSv4.1 DRC requires statp */
- if (rqstp->rq_vers == 4)
- nfsd4_set_statp(rqstp, statp);
-
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -605,7 +616,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
- if (nfsd_serv == NULL)
+ int ret;
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_serv == NULL) {
+ mutex_unlock(&nfsd_mutex);
return -ENODEV;
- return svc_pool_stats_open(nfsd_serv, file);
+ }
+ /* bump up the psudo refcount while traversing */
+ svc_get(nfsd_serv);
+ ret = svc_pool_stats_open(nfsd_serv, file);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
+}
+
+int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+{
+ int ret = seq_release(inode, file);
+ mutex_lock(&nfsd_mutex);
+ /* this function really, really should have been called svc_put() */
+ svc_destroy(nfsd_serv);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..a7038ede671a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -56,6 +56,7 @@
#endif /* CONFIG_NFSD_V4 */
#include <linux/jhash.h>
#include <linux/ima.h>
+#include "vfs.h"
#include <asm/uaccess.h>
@@ -89,6 +90,12 @@ struct raparm_hbucket {
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
+static inline int
+nfsd_v4client(struct svc_rqst *rq)
+{
+ return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+}
+
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -115,7 +122,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
path_put(&path);
goto out;
}
- if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+ if (nfsd_v4client(rqstp) ||
+ (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
/* successfully crossed mount point */
/*
* This is subtle: path.dentry is *not* on path.mnt
@@ -134,6 +142,40 @@ out:
return err;
}
+static void follow_to_parent(struct path *path)
+{
+ struct dentry *dp;
+
+ while (path->dentry == path->mnt->mnt_root && follow_up(path))
+ ;
+ dp = dget_parent(path->dentry);
+ dput(path->dentry);
+ path->dentry = dp;
+}
+
+static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
+{
+ struct svc_export *exp2;
+ struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
+ .dentry = dget(dparent)};
+
+ follow_to_parent(&path);
+
+ exp2 = rqst_exp_parent(rqstp, &path);
+ if (PTR_ERR(exp2) == -ENOENT) {
+ *dentryp = dget(dparent);
+ } else if (IS_ERR(exp2)) {
+ path_put(&path);
+ return PTR_ERR(exp2);
+ } else {
+ *dentryp = dget(path.dentry);
+ exp_put(*exp);
+ *exp = exp2;
+ }
+ path_put(&path);
+ return 0;
+}
+
__be32
nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
const char *name, unsigned int len,
@@ -162,35 +204,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = dget(dparent);
else if (dparent != exp->ex_path.dentry)
dentry = dget_parent(dparent);
- else if (!EX_NOHIDE(exp))
+ else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
dentry = dget(dparent); /* .. == . just like at / */
else {
/* checking mountpoint crossing is very different when stepping up */
- struct svc_export *exp2 = NULL;
- struct dentry *dp;
- struct path path = {.mnt = mntget(exp->ex_path.mnt),
- .dentry = dget(dparent)};
-
- while (path.dentry == path.mnt->mnt_root &&
- follow_up(&path))
- ;
- dp = dget_parent(path.dentry);
- dput(path.dentry);
- path.dentry = dp;
-
- exp2 = rqst_exp_parent(rqstp, &path);
- if (PTR_ERR(exp2) == -ENOENT) {
- dentry = dget(dparent);
- } else if (IS_ERR(exp2)) {
- host_err = PTR_ERR(exp2);
- path_put(&path);
+ host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
+ if (host_err)
goto out_nfserr;
- } else {
- dentry = dget(path.dentry);
- exp_put(exp);
- exp = exp2;
- }
- path_put(&path);
}
} else {
fh_lock(fhp);
@@ -684,6 +704,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
__be32 err;
int host_err;
+ validate_process_creds();
+
/*
* If we get here, then the client has already done an "open",
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +762,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
out_nfserr:
err = nfserrno(host_err);
out:
+ validate_process_creds();
return err;
}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
new file mode 100644
index 000000000000..b8011fd2fcab
--- /dev/null
+++ b/fs/nfsd/vfs.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_VFS_H
+#define LINUX_NFSD_VFS_H
+
+/*
+ * Flags for nfsd_permission
+ */
+#define NFSD_MAY_NOP 0
+#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */
+#define NFSD_MAY_READ 4 /* == MAY_READ */
+#define NFSD_MAY_SATTR 8
+#define NFSD_MAY_TRUNC 16
+#define NFSD_MAY_LOCK 32
+#define NFSD_MAY_OWNER_OVERRIDE 64
+#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+
+#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
+#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
+
+/*
+ * Callback function for readdir
+ */
+typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+
+/* nfsd/vfs.c */
+int fh_lock_parent(struct svc_fh *, struct dentry *);
+int nfsd_racache_init(int);
+void nfsd_racache_shutdown(void);
+int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+ struct svc_export **expp);
+__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+ const char *, unsigned int, struct svc_fh *);
+__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+ const char *, unsigned int,
+ struct svc_export **, struct dentry **);
+__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+ struct iattr *, int, time_t);
+#ifdef CONFIG_NFSD_V4
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
+ struct nfs4_acl *);
+int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+#endif /* CONFIG_NFSD_V4 */
+__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ struct svc_fh *res, int createmode,
+ u32 *verifier, int *truncp, int *created);
+__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
+ loff_t, unsigned long);
+#endif /* CONFIG_NFSD_V3 */
+__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+ int, struct file **);
+void nfsd_close(struct file *);
+__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+ loff_t, struct kvec *, int, unsigned long *);
+__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
+ loff_t, struct kvec *,int, unsigned long *, int *);
+__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, char *path, int plen,
+ struct svc_fh *res, struct iattr *);
+__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
+ char *, int, struct svc_fh *);
+__be32 nfsd_rename(struct svc_rqst *,
+ struct svc_fh *, char *, int,
+ struct svc_fh *, char *, int);
+__be32 nfsd_remove(struct svc_rqst *,
+ struct svc_fh *, char *, int);
+__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+ char *name, int len);
+int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
+ unsigned long size);
+__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+ loff_t *, struct readdir_cd *, filldir_t);
+__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+ struct kstatfs *, int access);
+
+int nfsd_notify_change(struct inode *, struct iattr *);
+__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
+ struct dentry *, int);
+int nfsd_sync_dir(struct dentry *dp);
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
+int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
+#endif
+
+#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
config NILFS2_FS
tristate "NILFS2 file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ depends on EXPERIMENTAL
select CRC32
help
NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae59251..3f959f1879d8 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -142,29 +142,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode,
}
}
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+ int create,
+ void (*init_block)(struct inode *,
+ struct buffer_head *,
+ void *),
+ struct buffer_head **bhp,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ int ret;
+
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ get_bh(prev->bh);
+ *bhp = prev->bh;
+ spin_unlock(lock);
+ return 0;
+ }
+ spin_unlock(lock);
+
+ ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+ if (!ret) {
+ spin_lock(lock);
+ /*
+ * The following code must be safe for change of the
+ * cache contents during the get block call.
+ */
+ brelse(prev->bh);
+ get_bh(*bhp);
+ prev->bh = *bhp;
+ prev->blkoff = blkoff;
+ spin_unlock(lock);
+ }
+ return ret;
+}
+
static int nilfs_palloc_get_desc_block(struct inode *inode,
unsigned long group,
int create, struct buffer_head **bhp)
{
- return nilfs_mdt_get_block(inode,
- nilfs_palloc_desc_blkoff(inode, group),
- create, nilfs_palloc_desc_block_init, bhp);
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_get_block(inode,
+ nilfs_palloc_desc_blkoff(inode, group),
+ create, nilfs_palloc_desc_block_init,
+ bhp, &cache->prev_desc, &cache->lock);
}
static int nilfs_palloc_get_bitmap_block(struct inode *inode,
unsigned long group,
int create, struct buffer_head **bhp)
{
- return nilfs_mdt_get_block(inode,
- nilfs_palloc_bitmap_blkoff(inode, group),
- create, NULL, bhp);
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_get_block(inode,
+ nilfs_palloc_bitmap_blkoff(inode, group),
+ create, NULL, bhp,
+ &cache->prev_bitmap, &cache->lock);
}
int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
int create, struct buffer_head **bhp)
{
- return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
- create, NULL, bhp);
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_get_block(inode,
+ nilfs_palloc_entry_blkoff(inode, nr),
+ create, NULL, bhp,
+ &cache->prev_entry, &cache->lock);
}
static struct nilfs_palloc_group_desc *
@@ -176,13 +222,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
group % nilfs_palloc_groups_per_desc_block(inode);
}
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
- const struct buffer_head *bh, void *kaddr)
-{
- return (unsigned char *)(kaddr + bh_offset(bh));
-}
-
void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
const struct buffer_head *bh, void *kaddr)
{
@@ -289,8 +328,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
if (ret < 0)
goto out_desc;
bitmap_kaddr = kmap(bitmap_bh->b_page);
- bitmap = nilfs_palloc_block_get_bitmap(
- inode, bitmap_bh, bitmap_kaddr);
+ bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
inode, group, group_offset, bitmap,
entries_per_group);
@@ -351,8 +389,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
- bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
- bitmap_kaddr);
+ bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
group_offset, bitmap))
@@ -385,8 +422,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
desc = nilfs_palloc_block_get_group_desc(inode, group,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
- bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
- bitmap_kaddr);
+ bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
group_offset, bitmap))
printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +508,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
bitmap_kaddr = kmap(bitmap_bh->b_page);
- bitmap = nilfs_palloc_block_get_bitmap(
- inode, bitmap_bh, bitmap_kaddr);
+ bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
for (j = i, n = 0;
(j < nitems) && nilfs_palloc_group_is_in(inode, group,
entry_nrs[j]);
@@ -502,3 +537,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
}
return 0;
}
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+ struct nilfs_palloc_cache *cache)
+{
+ NILFS_MDT(inode)->mi_palloc_cache = cache;
+ spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ spin_lock(&cache->lock);
+ brelse(cache->prev_desc.bh);
+ brelse(cache->prev_bitmap.bh);
+ brelse(cache->prev_entry.bh);
+ cache->prev_desc.bh = NULL;
+ cache->prev_bitmap.bh = NULL;
+ cache->prev_entry.bh = NULL;
+ spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+ nilfs_palloc_clear_cache(inode);
+ NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475c2c7..f4543ac4f560 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
+/*
+ * persistent object allocator cache
+ */
+
+struct nilfs_bh_assoc {
+ unsigned long blkoff;
+ struct buffer_head *bh;
+};
+
+struct nilfs_palloc_cache {
+ spinlock_t lock;
+ struct nilfs_bh_assoc prev_desc;
+ struct nilfs_bh_assoc prev_bitmap;
+ struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+ struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
#endif /* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..f4a14ea2ed9c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
}
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
__u64 *ptrp)
{
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
return ret;
}
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long *recp)
-{
- __u64 ptr;
- int ret;
-
- /* XXX: use macro for level 1 */
- ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
- if (recp != NULL)
- *recp = ptr;
- return ret;
-}
-
static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
{
__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -415,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
{
inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
- if (NILFS_MDT(bmap->b_inode))
- nilfs_mdt_mark_dirty(bmap->b_inode);
- else
- mark_inode_dirty(bmap->b_inode);
}
void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
{
inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
- if (NILFS_MDT(bmap->b_inode))
- nilfs_mdt_mark_dirty(bmap->b_inode);
- else
- mark_inode_dirty(bmap->b_inode);
}
__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
@@ -469,104 +448,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
(entries_per_group / NILFS_BMAP_GROUP_DIV);
}
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
- sector_t blocknr)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
- int ret;
-
- ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
- if (likely(!ret))
- nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
- return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
- bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
- sector_t blocknr)
-{
- return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
- return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
- int ret;
-
- ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
- if (ret < 0)
- return ret;
- ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
- if (ret < 0)
- nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
- return ret;
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
-
- nilfs_dat_commit_end(dat, &oldreq->bpr_req,
- bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
- nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
-
- nilfs_dat_abort_end(dat, &oldreq->bpr_req);
- nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-}
-
static struct lock_class_key nilfs_bmap_dat_lock_key;
static struct lock_class_key nilfs_bmap_mdt_lock_key;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
#include <linux/buffer_head.h>
#include <linux/nilfs2_fs.h>
#include "alloc.h"
+#include "dat.h"
#define NILFS_BMAP_INVALID_PTR 0
@@ -141,7 +142,6 @@ struct nilfs_bmap {
int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+ __u64 *ptr)
+{
+ return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
/*
* Internal use only
*/
struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- return nilfs_bmap_prepare_alloc_v(bmap, req);
+ if (dat)
+ return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
/* ignore target ptr */
req->bpr_ptr = bmap->b_last_allocated_ptr++;
return 0;
}
static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_commit_alloc_v(bmap, req);
+ if (dat)
+ nilfs_dat_commit_alloc(dat, &req->bpr_req);
}
static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_abort_alloc_v(bmap, req);
+ if (dat)
+ nilfs_dat_abort_alloc(dat, &req->bpr_req);
else
bmap->b_last_allocated_ptr--;
}
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- return NILFS_BMAP_USE_VBN(bmap) ?
- nilfs_bmap_prepare_end_v(bmap, req) : 0;
+ return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
}
static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_commit_end_v(bmap, req);
+ if (dat)
+ nilfs_dat_commit_end(dat, &req->bpr_req,
+ bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
}
static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_abort_end_v(bmap, req);
+ if (dat)
+ nilfs_dat_abort_end(dat, &req->bpr_req);
}
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
- sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
const struct buffer_head *);
__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-
void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c668bca579c1..471e269536ae 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
void nilfs_btnode_cache_init_once(struct address_space *btnc)
{
+ memset(btnc, 0, sizeof(*btnc));
INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
spin_lock_init(&btnc->tree_lock);
INIT_LIST_HEAD(&btnc->private_list);
@@ -46,7 +47,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
}
-static struct address_space_operations def_btnode_aops = {
+static const struct address_space_operations def_btnode_aops = {
.sync_page = block_sync_page,
};
@@ -67,9 +68,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
truncate_inode_pages(btnc, 0);
}
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+ struct inode *inode = NILFS_BTNC_I(btnc);
+ struct buffer_head *bh;
+
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ if (unlikely(!bh))
+ return NULL;
+
+ if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+ buffer_dirty(bh))) {
+ brelse(bh);
+ BUG();
+ }
+ memset(bh->b_data, 0, 1 << inode->i_blkbits);
+ bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_blocknr = blocknr;
+ set_buffer_mapped(bh);
+ set_buffer_uptodate(bh);
+
+ unlock_page(bh->b_page);
+ page_cache_release(bh->b_page);
+ return bh;
+}
+
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
- sector_t pblocknr, struct buffer_head **pbh,
- int newblk)
+ sector_t pblocknr, struct buffer_head **pbh)
{
struct buffer_head *bh;
struct inode *inode = NILFS_BTNC_I(btnc);
@@ -80,18 +106,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
return -ENOMEM;
err = -EEXIST; /* internal code */
- if (newblk) {
- if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
- buffer_dirty(bh))) {
- brelse(bh);
- BUG();
- }
- bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
- bh->b_blocknr = blocknr;
- set_buffer_mapped(bh);
- set_buffer_uptodate(bh);
- goto found;
- }
if (buffer_uptodate(bh) || buffer_dirty(bh))
goto found;
@@ -133,27 +147,6 @@ out_locked:
return err;
}
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
- sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
- struct buffer_head *bh;
- int err;
-
- err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
- if (err == -EEXIST) /* internal code (cache hit) */
- return 0;
- if (unlikely(err))
- return err;
-
- bh = *pbh;
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- brelse(bh);
- return -EIO;
- }
- return 0;
-}
-
/**
* nilfs_btnode_delete - delete B-tree node buffer
* @bh: buffer to be deleted
@@ -242,12 +235,13 @@ retry:
unlock_page(obh->b_page);
}
- err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
- if (likely(!err)) {
- BUG_ON(nbh == obh);
- ctxt->newbh = nbh;
- }
- return err;
+ nbh = nilfs_btnode_create_block(btnc, newkey);
+ if (!nbh)
+ return -ENOMEM;
+
+ BUG_ON(nbh == obh);
+ ctxt->newbh = nbh;
+ return 0;
failed_unlock:
unlock_page(obh->b_page);
@@ -275,8 +269,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
- if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
- BUG();
+ nilfs_btnode_mark_dirty(obh);
spin_lock_irq(&btnc->tree_lock);
radix_tree_delete(&btnc->page_tree, oldkey);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e2275172ed6..07da83f07712 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt {
void nilfs_btnode_cache_init_once(struct address_space *);
void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+ __u64 blocknr);
int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
- struct buffer_head **, int);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
- struct buffer_head **, int);
+ struct buffer_head **);
void nilfs_btnode_delete(struct buffer_head *);
int nilfs_btnode_prepare_change_key(struct address_space *,
struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..139b113e8338 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
kmem_cache_destroy(nilfs_btree_path_cache);
}
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
- return (struct nilfs_btree_path *)
- kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+ return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
}
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
{
kmem_cache_free(nilfs_btree_path_cache, path);
}
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
{
int level;
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
}
}
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
{
int level;
- for (level = NILFS_BTREE_LEVEL_DATA;
- level < NILFS_BTREE_LEVEL_MAX;
- level++) {
- if (path[level].bp_bh != NULL) {
- brelse(path[level].bp_bh);
- path[level].bp_bh = NULL;
- }
- /* sib_bh is released or deleted by prepare or commit
- * operations. */
- path[level].bp_sib_bh = NULL;
- path[level].bp_index = 0;
- path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
- path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
- path[level].bp_op = NULL;
- }
+ for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+ level++)
+ brelse(path[level].bp_bh);
}
/*
@@ -131,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
{
struct address_space *btnc =
&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
- return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+ int err;
+
+ err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+ if (err)
+ return err == -EEXIST ? 0 : err;
+
+ wait_on_buffer(*bhp);
+ if (!buffer_uptodate(*bhp)) {
+ brelse(*bhp);
+ return -EIO;
+ }
+ return 0;
}
static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -139,138 +133,122 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
{
struct address_space *btnc =
&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
- int ret;
+ struct buffer_head *bh;
- ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
- if (!ret)
- set_buffer_nilfs_volatile(*bhp);
- return ret;
+ bh = nilfs_btnode_create_block(btnc, ptr);
+ if (!bh)
+ return -ENOMEM;
+
+ set_buffer_nilfs_volatile(bh);
+ *bhp = bh;
+ return 0;
}
static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
{
return node->bn_flags;
}
static inline void
-nilfs_btree_node_set_flags(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int flags)
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
{
node->bn_flags = flags;
}
-static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
{
- return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
+ return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
}
static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
{
return node->bn_level;
}
static inline void
-nilfs_btree_node_set_level(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int level)
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
{
node->bn_level = level;
}
static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
{
return le16_to_cpu(node->bn_nchildren);
}
static inline void
-nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int nchildren)
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
{
node->bn_nchildren = cpu_to_le16(nchildren);
}
-static inline int
-nilfs_btree_node_size(const struct nilfs_btree *btree)
+static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
{
return 1 << btree->bt_bmap.b_inode->i_blkbits;
}
static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return nilfs_btree_node_root(btree, node) ?
+ return nilfs_btree_node_root(node) ?
NILFS_BTREE_ROOT_NCHILDREN_MIN :
NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
}
static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return nilfs_btree_node_root(btree, node) ?
+ return nilfs_btree_node_root(node) ?
NILFS_BTREE_ROOT_NCHILDREN_MAX :
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
}
static inline __le64 *
-nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
{
return (__le64 *)((char *)(node + 1) +
- (nilfs_btree_node_root(btree, node) ?
+ (nilfs_btree_node_root(node) ?
0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
}
static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
- nilfs_btree_node_nchildren_max(btree, node));
+ return (__le64 *)(nilfs_btree_node_dkeys(node) +
+ nilfs_btree_node_nchildren_max(node, btree));
}
static inline __u64
-nilfs_btree_node_get_key(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node, int index)
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
{
- return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
- index));
+ return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
}
static inline void
-nilfs_btree_node_set_key(struct nilfs_btree *btree,
- struct nilfs_btree_node *node, int index, __u64 key)
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
{
- *(nilfs_btree_node_dkeys(btree, node) + index) =
- nilfs_bmap_key_to_dkey(key);
+ *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
}
static inline __u64
nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node,
- int index)
+ const struct nilfs_btree_node *node, int index)
{
- return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
+ return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
index));
}
static inline void
nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int index,
- __u64 ptr)
+ struct nilfs_btree_node *node, int index, __u64 ptr)
{
- *(nilfs_btree_node_dptrs(btree, node) + index) =
+ *(nilfs_btree_node_dptrs(node, btree) + index) =
nilfs_bmap_ptr_to_dptr(ptr);
}
@@ -283,12 +261,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
__le64 *dptrs;
int i;
- nilfs_btree_node_set_flags(btree, node, flags);
- nilfs_btree_node_set_level(btree, node, level);
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_flags(node, flags);
+ nilfs_btree_node_set_level(node, level);
+ nilfs_btree_node_set_nchildren(node, nchildren);
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
for (i = 0; i < nchildren; i++) {
dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +283,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
__le64 *ldptrs, *rdptrs;
int lnchildren, rnchildren;
- ldkeys = nilfs_btree_node_dkeys(btree, left);
- ldptrs = nilfs_btree_node_dptrs(btree, left);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ ldkeys = nilfs_btree_node_dkeys(left);
+ ldptrs = nilfs_btree_node_dptrs(left, btree);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
- rdkeys = nilfs_btree_node_dkeys(btree, right);
- rdptrs = nilfs_btree_node_dptrs(btree, right);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ rdkeys = nilfs_btree_node_dkeys(right);
+ rdptrs = nilfs_btree_node_dptrs(right, btree);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +298,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
lnchildren += n;
rnchildren -= n;
- nilfs_btree_node_set_nchildren(btree, left, lnchildren);
- nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+ nilfs_btree_node_set_nchildren(left, lnchildren);
+ nilfs_btree_node_set_nchildren(right, rnchildren);
}
/* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +312,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
__le64 *ldptrs, *rdptrs;
int lnchildren, rnchildren;
- ldkeys = nilfs_btree_node_dkeys(btree, left);
- ldptrs = nilfs_btree_node_dptrs(btree, left);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ ldkeys = nilfs_btree_node_dkeys(left);
+ ldptrs = nilfs_btree_node_dptrs(left, btree);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
- rdkeys = nilfs_btree_node_dkeys(btree, right);
- rdptrs = nilfs_btree_node_dptrs(btree, right);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ rdkeys = nilfs_btree_node_dkeys(right);
+ rdptrs = nilfs_btree_node_dptrs(right, btree);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +327,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
lnchildren -= n;
rnchildren += n;
- nilfs_btree_node_set_nchildren(btree, left, lnchildren);
- nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+ nilfs_btree_node_set_nchildren(left, lnchildren);
+ nilfs_btree_node_set_nchildren(right, rnchildren);
}
/* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +340,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
__le64 *dptrs;
int nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (index < nchildren) {
memmove(dkeys + index + 1, dkeys + index,
(nchildren - index) * sizeof(*dkeys));
@@ -374,7 +352,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
dkeys[index] = nilfs_bmap_key_to_dkey(key);
dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
nchildren++;
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_nchildren(node, nchildren);
}
/* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +366,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
__le64 *dptrs;
int nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
key = nilfs_bmap_dkey_to_key(dkeys[index]);
ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (keyp != NULL)
*keyp = key;
if (ptrp != NULL)
@@ -405,11 +383,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
(nchildren - index - 1) * sizeof(*dptrs));
}
nchildren--;
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_nchildren(node, nchildren);
}
-static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node,
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
__u64 key, int *indexp)
{
__u64 nkey;
@@ -417,12 +394,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
/* binary search */
low = 0;
- high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ high = nilfs_btree_node_get_nchildren(node) - 1;
index = 0;
s = 0;
while (low <= high) {
index = (low + high) / 2;
- nkey = nilfs_btree_node_get_key(btree, node, index);
+ nkey = nilfs_btree_node_get_key(node, index);
if (nkey == key) {
s = 0;
goto out;
@@ -436,9 +413,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
}
/* adjust index */
- if (nilfs_btree_node_get_level(btree, node) >
- NILFS_BTREE_LEVEL_NODE_MIN) {
- if ((s > 0) && (index > 0))
+ if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+ if (s > 0 && index > 0)
index--;
} else if (s < 0)
index++;
@@ -456,25 +432,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
}
static inline struct nilfs_btree_node *
-nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
- const struct nilfs_btree_path *path,
- int level)
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
{
return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
}
static inline struct nilfs_btree_node *
-nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
- const struct nilfs_btree_path *path,
- int level)
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
{
return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
}
static inline int nilfs_btree_height(const struct nilfs_btree *btree)
{
- return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
- + 1;
+ return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
}
static inline struct nilfs_btree_node *
@@ -484,7 +455,19 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
{
return (level == nilfs_btree_height(btree) - 1) ?
nilfs_btree_get_root(btree) :
- nilfs_btree_get_nonroot_node(btree, path, level);
+ nilfs_btree_get_nonroot_node(path, level);
+}
+
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+ if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+ dump_stack();
+ printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+ nilfs_btree_node_get_level(node), level);
+ return 1;
+ }
+ return 0;
}
static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +479,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
int level, index, found, ret;
node = nilfs_btree_get_root(btree);
- level = nilfs_btree_node_get_level(btree, node);
- if ((level < minlevel) ||
- (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+ level = nilfs_btree_node_get_level(node);
+ if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
return -ENOENT;
- found = nilfs_btree_node_lookup(btree, node, key, &index);
+ found = nilfs_btree_node_lookup(node, key, &index);
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_bh = NULL;
path[level].bp_index = index;
@@ -510,14 +492,14 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
if (ret < 0)
return ret;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+ node = nilfs_btree_get_nonroot_node(path, level);
+ if (nilfs_btree_bad_node(node, level))
+ return -EINVAL;
if (!found)
- found = nilfs_btree_node_lookup(btree, node, key,
- &index);
+ found = nilfs_btree_node_lookup(node, key, &index);
else
index = 0;
- if (index < nilfs_btree_node_nchildren_max(btree, node))
+ if (index < nilfs_btree_node_nchildren_max(node, btree))
ptr = nilfs_btree_node_get_ptr(btree, node, index);
else {
WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +526,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
int index, level, ret;
node = nilfs_btree_get_root(btree);
- index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ index = nilfs_btree_node_get_nchildren(node) - 1;
if (index < 0)
return -ENOENT;
- level = nilfs_btree_node_get_level(btree, node);
+ level = nilfs_btree_node_get_level(node);
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_bh = NULL;
path[level].bp_index = index;
@@ -556,15 +538,16 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
if (ret < 0)
return ret;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- BUG_ON(level != nilfs_btree_node_get_level(btree, node));
- index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ node = nilfs_btree_get_nonroot_node(path, level);
+ if (nilfs_btree_bad_node(node, level))
+ return -EINVAL;
+ index = nilfs_btree_node_get_nchildren(node) - 1;
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_index = index;
}
if (keyp != NULL)
- *keyp = nilfs_btree_node_get_key(btree, node, index);
+ *keyp = nilfs_btree_node_get_key(node, index);
if (ptrp != NULL)
*ptrp = ptr;
@@ -580,18 +563,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ptrp != NULL)
*ptrp = ptr;
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -608,10 +591,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
int level = NILFS_BTREE_LEVEL_NODE_MIN;
int ret, cnt, index, maxlevel;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ret < 0)
goto out;
@@ -631,8 +614,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
node = nilfs_btree_get_node(btree, path, level);
index = path[level].bp_index + 1;
for (;;) {
- while (index < nilfs_btree_node_get_nchildren(btree, node)) {
- if (nilfs_btree_node_get_key(btree, node, index) !=
+ while (index < nilfs_btree_node_get_nchildren(node)) {
+ if (nilfs_btree_node_get_key(node, index) !=
key + cnt)
goto end;
ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +636,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
/* look-up right sibling node */
node = nilfs_btree_get_node(btree, path, level + 1);
index = path[level + 1].bp_index + 1;
- if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
- nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+ if (index >= nilfs_btree_node_get_nchildren(node) ||
+ nilfs_btree_node_get_key(node, index) != key + cnt)
break;
ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
path[level + 1].bp_index = index;
@@ -664,7 +647,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
if (ret < 0)
goto out;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
index = 0;
path[level].bp_index = index;
}
@@ -672,8 +655,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
*ptrp = ptr;
ret = cnt;
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -683,23 +666,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
{
if (level < nilfs_btree_height(btree) - 1) {
do {
- lock_buffer(path[level].bp_bh);
nilfs_btree_node_set_key(
- btree,
- nilfs_btree_get_nonroot_node(
- btree, path, level),
+ nilfs_btree_get_nonroot_node(path, level),
path[level].bp_index, key);
if (!buffer_dirty(path[level].bp_bh))
nilfs_btnode_mark_dirty(path[level].bp_bh);
- unlock_buffer(path[level].bp_bh);
} while ((path[level].bp_index == 0) &&
(++level < nilfs_btree_height(btree) - 1));
}
/* root */
if (level == nilfs_btree_height(btree) - 1) {
- nilfs_btree_node_set_key(btree,
- nilfs_btree_get_root(btree),
+ nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
path[level].bp_index, key);
}
}
@@ -711,18 +689,16 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
struct nilfs_btree_node *node;
if (level < nilfs_btree_height(btree) - 1) {
- lock_buffer(path[level].bp_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
path[level].bp_index);
if (!buffer_dirty(path[level].bp_bh))
nilfs_btnode_mark_dirty(path[level].bp_bh);
- unlock_buffer(path[level].bp_bh);
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(
- btree, node, 0));
+ nilfs_btree_node_get_key(node,
+ 0));
} else {
node = nilfs_btree_get_root(btree);
nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -737,13 +713,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
struct nilfs_btree_node *node, *left;
int nchildren, lnchildren, n, move;
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
move = 0;
n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -760,11 +733,8 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
if (move) {
brelse(path[level].bp_bh);
@@ -788,13 +758,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
struct nilfs_btree_node *node, *right;
int nchildren, rnchildren, n, move;
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
move = 0;
n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -811,20 +778,16 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, right, 0));
+ nilfs_btree_node_get_key(right, 0));
path[level + 1].bp_index--;
if (move) {
brelse(path[level].bp_bh);
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
- path[level].bp_index -=
- nilfs_btree_node_get_nchildren(btree, node);
+ path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
path[level + 1].bp_index++;
} else {
brelse(path[level].bp_sib_bh);
@@ -843,12 +806,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
__u64 newptr;
int nchildren, n, move;
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
move = 0;
n = (nchildren + 1) / 2;
@@ -864,19 +824,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
- newkey = nilfs_btree_node_get_key(btree, right, 0);
+ newkey = nilfs_btree_node_get_key(right, 0);
newptr = path[level].bp_newreq.bpr_ptr;
if (move) {
- path[level].bp_index -=
- nilfs_btree_node_get_nchildren(btree, node);
+ path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
path[level].bp_index);
- *keyp = nilfs_btree_node_get_key(btree, right, 0);
+ *keyp = nilfs_btree_node_get_key(right, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
brelse(path[level].bp_bh);
@@ -885,7 +841,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
} else {
nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
- *keyp = nilfs_btree_node_get_key(btree, right, 0);
+ *keyp = nilfs_btree_node_get_key(right, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
brelse(path[level].bp_sib_bh);
@@ -902,27 +858,23 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
struct nilfs_btree_node *root, *child;
int n;
- lock_buffer(path[level].bp_sib_bh);
-
root = nilfs_btree_get_root(btree);
- child = nilfs_btree_get_sib_node(btree, path, level);
+ child = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, root);
+ n = nilfs_btree_node_get_nchildren(root);
nilfs_btree_node_move_right(btree, root, child, n);
- nilfs_btree_node_set_level(btree, root, level + 1);
+ nilfs_btree_node_set_level(root, level + 1);
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
- *keyp = nilfs_btree_node_get_key(btree, child, 0);
+ *keyp = nilfs_btree_node_get_key(child, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
}
@@ -990,26 +942,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
struct nilfs_btree_node *node, *parent, *sib;
__u64 sibptr;
int pindex, level, ret;
+ struct inode *dat = NULL;
stats->bs_nblocks = 0;
level = NILFS_BTREE_LEVEL_DATA;
/* allocate a new ptr for data block */
- if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+ if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
path[level].bp_newreq.bpr_ptr =
nilfs_btree_find_target_v(btree, path, key);
+ dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+ }
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_data;
for (level = NILFS_BTREE_LEVEL_NODE_MIN;
level < nilfs_btree_height(btree) - 1;
level++) {
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- if (nilfs_btree_node_get_nchildren(btree, node) <
- nilfs_btree_node_nchildren_max(btree, node)) {
+ node = nilfs_btree_get_nonroot_node(path, level);
+ if (nilfs_btree_node_get_nchildren(node) <
+ nilfs_btree_node_nchildren_max(node, btree)) {
path[level].bp_op = nilfs_btree_do_insert;
stats->bs_nblocks++;
goto out;
@@ -1026,8 +981,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_child_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) <
- nilfs_btree_node_nchildren_max(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) <
+ nilfs_btree_node_nchildren_max(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_carry_left;
stats->bs_nblocks++;
@@ -1038,15 +993,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* right sibling */
if (pindex <
- nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+ nilfs_btree_node_get_nchildren(parent) - 1) {
sibptr = nilfs_btree_node_get_ptr(btree, parent,
pindex + 1);
ret = nilfs_btree_get_block(btree, sibptr, &bh);
if (ret < 0)
goto err_out_child_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) <
- nilfs_btree_node_nchildren_max(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) <
+ nilfs_btree_node_nchildren_max(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_carry_right;
stats->bs_nblocks++;
@@ -1059,7 +1014,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
path[level].bp_newreq.bpr_ptr =
path[level - 1].bp_newreq.bpr_ptr + 1;
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_child_node;
ret = nilfs_btree_get_new_block(btree,
@@ -1070,19 +1025,17 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
stats->bs_nblocks++;
- lock_buffer(bh);
nilfs_btree_node_init(btree,
(struct nilfs_btree_node *)bh->b_data,
0, level, 0, NULL, NULL);
- unlock_buffer(bh);
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_split;
}
/* root */
node = nilfs_btree_get_root(btree);
- if (nilfs_btree_node_get_nchildren(btree, node) <
- nilfs_btree_node_nchildren_max(btree, node)) {
+ if (nilfs_btree_node_get_nchildren(node) <
+ nilfs_btree_node_nchildren_max(node, btree)) {
path[level].bp_op = nilfs_btree_do_insert;
stats->bs_nblocks++;
goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* grow */
path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_child_node;
ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1099,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_curr_node;
- lock_buffer(bh);
nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
0, level, 0, NULL, NULL);
- unlock_buffer(bh);
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_grow;
@@ -1119,16 +1070,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* error */
err_out_curr_node:
- nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+ nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+ dat);
err_out_child_node:
for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
nilfs_btnode_delete(path[level].bp_sib_bh);
nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
}
- nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+ nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+ dat);
err_out_data:
*levelp = level;
stats->bs_nblocks = 0;
@@ -1139,16 +1092,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
int maxlevel, __u64 key, __u64 ptr)
{
+ struct inode *dat = NULL;
int level;
set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
- if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+ if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
nilfs_btree_set_target_v(btree, key, ptr);
+ dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+ }
for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
- &path[level - 1].bp_newreq);
+ &path[level - 1].bp_newreq, dat);
path[level].bp_op(btree, path, level, &key, &ptr);
}
@@ -1164,10 +1120,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1140,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1196,16 +1152,14 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
struct nilfs_btree_node *node;
if (level < nilfs_btree_height(btree) - 1) {
- lock_buffer(path[level].bp_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_delete(btree, node, keyp, ptrp,
path[level].bp_index);
if (!buffer_dirty(path[level].bp_bh))
nilfs_btnode_mark_dirty(path[level].bp_bh);
- unlock_buffer(path[level].bp_bh);
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
} else {
node = nilfs_btree_get_root(btree);
nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1222,13 +1176,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
n = (nchildren + lnchildren) / 2 - nchildren;
@@ -1239,11 +1190,8 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
brelse(path[level].bp_sib_bh);
path[level].bp_sib_bh = NULL;
@@ -1259,13 +1207,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
n = (nchildren + rnchildren) / 2 - nchildren;
@@ -1276,12 +1221,9 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, right, 0));
+ nilfs_btree_node_get_key(right, 0));
path[level + 1].bp_index--;
brelse(path[level].bp_sib_bh);
@@ -1297,26 +1239,20 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, node);
+ n = nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_move_left(btree, left, node, n);
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
nilfs_btnode_delete(path[level].bp_bh);
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
- path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+ path[level].bp_index += nilfs_btree_node_get_nchildren(left);
}
static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1328,22 +1264,16 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
- lock_buffer(path[level].bp_bh);
- lock_buffer(path[level].bp_sib_bh);
-
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, right);
+ n = nilfs_btree_node_get_nchildren(right);
nilfs_btree_node_move_left(btree, node, right, n);
if (!buffer_dirty(path[level].bp_bh))
nilfs_btnode_mark_dirty(path[level].bp_bh);
- unlock_buffer(path[level].bp_bh);
- unlock_buffer(path[level].bp_sib_bh);
-
nilfs_btnode_delete(path[level].bp_sib_bh);
path[level].bp_sib_bh = NULL;
path[level + 1].bp_index++;
@@ -1358,15 +1288,13 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
- lock_buffer(path[level].bp_bh);
root = nilfs_btree_get_root(btree);
- child = nilfs_btree_get_nonroot_node(btree, path, level);
+ child = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
- nilfs_btree_node_set_level(btree, root, level);
- n = nilfs_btree_node_get_nchildren(btree, child);
+ nilfs_btree_node_set_level(root, level);
+ n = nilfs_btree_node_get_nchildren(child);
nilfs_btree_node_move_left(btree, root, child, n);
- unlock_buffer(path[level].bp_bh);
nilfs_btnode_delete(path[level].bp_bh);
path[level].bp_bh = NULL;
@@ -1376,7 +1304,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
int *levelp,
- struct nilfs_bmap_stats *stats)
+ struct nilfs_bmap_stats *stats,
+ struct inode *dat)
{
struct buffer_head *bh;
struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1317,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
for (level = NILFS_BTREE_LEVEL_NODE_MIN;
level < nilfs_btree_height(btree) - 1;
level++) {
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
path[level].bp_oldreq.bpr_ptr =
nilfs_btree_node_get_ptr(btree, node,
path[level].bp_index);
ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
if (ret < 0)
goto err_out_child_node;
- if (nilfs_btree_node_get_nchildren(btree, node) >
- nilfs_btree_node_nchildren_min(btree, node)) {
+ if (nilfs_btree_node_get_nchildren(node) >
+ nilfs_btree_node_nchildren_min(node, btree)) {
path[level].bp_op = nilfs_btree_do_delete;
stats->bs_nblocks++;
goto out;
@@ -1415,8 +1344,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_curr_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) >
- nilfs_btree_node_nchildren_min(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) >
+ nilfs_btree_node_nchildren_min(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_borrow_left;
stats->bs_nblocks++;
@@ -1428,7 +1357,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* continue; */
}
} else if (pindex <
- nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+ nilfs_btree_node_get_nchildren(parent) - 1) {
/* right sibling */
sibptr = nilfs_btree_node_get_ptr(btree, parent,
pindex + 1);
@@ -1436,8 +1365,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_curr_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) >
- nilfs_btree_node_nchildren_min(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) >
+ nilfs_btree_node_nchildren_min(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_borrow_right;
stats->bs_nblocks++;
@@ -1452,7 +1381,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* no siblings */
/* the only child of the root node */
WARN_ON(level != nilfs_btree_height(btree) - 2);
- if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+ if (nilfs_btree_node_get_nchildren(node) - 1 <=
NILFS_BTREE_ROOT_NCHILDREN_MAX) {
path[level].bp_op = nilfs_btree_shrink;
stats->bs_nblocks += 2;
@@ -1471,7 +1400,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
if (ret < 0)
goto err_out_child_node;
@@ -1486,12 +1415,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* error */
err_out_curr_node:
- nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
+ nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
err_out_child_node:
for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
brelse(path[level].bp_sib_bh);
nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
}
*levelp = level;
stats->bs_nblocks = 0;
@@ -1500,13 +1429,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int maxlevel)
+ int maxlevel, struct inode *dat)
{
int level;
for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
path[level].bp_op(btree, path, level, NULL, NULL);
}
@@ -1520,27 +1449,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
struct nilfs_btree *btree;
struct nilfs_btree_path *path;
struct nilfs_bmap_stats stats;
+ struct inode *dat;
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
if (ret < 0)
goto out;
- ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+
+ dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+ nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+
+ ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
if (ret < 0)
goto out;
- nilfs_btree_commit_delete(btree, path, level);
+ nilfs_btree_commit_delete(btree, path, level, dat);
nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1551,15 +1485,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1581,7 +1515,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
node = root;
break;
case 3:
- nchildren = nilfs_btree_node_get_nchildren(btree, root);
+ nchildren = nilfs_btree_node_get_nchildren(root);
if (nchildren > 1)
return 0;
ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1528,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
return 0;
}
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
nextmaxkey = (nchildren > 1) ?
- nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
+ nilfs_btree_node_get_key(node, nchildren - 2) : 0;
if (bh != NULL)
brelse(bh);
@@ -1623,7 +1557,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
node = root;
break;
case 3:
- nchildren = nilfs_btree_node_get_nchildren(btree, root);
+ nchildren = nilfs_btree_node_get_nchildren(root);
WARN_ON(nchildren > 1);
ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1570,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
return -EINVAL;
}
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (nchildren < nitems)
nitems = nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
for (i = 0; i < nitems; i++) {
keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1594,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
struct nilfs_bmap_stats *stats)
{
struct buffer_head *bh;
- struct nilfs_btree *btree;
+ struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+ struct inode *dat = NULL;
int ret;
- btree = (struct nilfs_btree *)bmap;
stats->bs_nblocks = 0;
/* for data */
/* cannot find near ptr */
- if (NILFS_BMAP_USE_VBN(bmap))
+ if (NILFS_BMAP_USE_VBN(bmap)) {
dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+ dat = nilfs_bmap_get_dat(bmap);
+ }
- ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
if (ret < 0)
return ret;
@@ -1679,7 +1615,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
stats->bs_nblocks++;
if (nreq != NULL) {
nreq->bpr_ptr = dreq->bpr_ptr + 1;
- ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
if (ret < 0)
goto err_out_dreq;
@@ -1696,9 +1632,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
/* error */
err_out_nreq:
- nilfs_bmap_abort_alloc_ptr(bmap, nreq);
+ nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
err_out_dreq:
- nilfs_bmap_abort_alloc_ptr(bmap, dreq);
+ nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
stats->bs_nblocks = 0;
return ret;
@@ -1713,8 +1649,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
union nilfs_bmap_ptr_req *nreq,
struct buffer_head *bh)
{
- struct nilfs_btree *btree;
+ struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
struct nilfs_btree_node *node;
+ struct inode *dat;
__u64 tmpptr;
/* free resources */
@@ -1725,14 +1662,13 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
/* convert and insert */
- btree = (struct nilfs_btree *)bmap;
+ dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
nilfs_btree_init(bmap);
if (nreq != NULL) {
- nilfs_bmap_commit_alloc_ptr(bmap, dreq);
- nilfs_bmap_commit_alloc_ptr(bmap, nreq);
+ nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+ nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
/* create child node at level 1 */
- lock_buffer(bh);
node = (struct nilfs_btree_node *)bh->b_data;
nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
nilfs_btree_node_insert(btree, node,
@@ -1742,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
if (!nilfs_bmap_dirty(bmap))
nilfs_bmap_set_dirty(bmap);
- unlock_buffer(bh);
brelse(bh);
/* create root node at level 2 */
@@ -1751,7 +1686,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
2, 1, &keys[0], &tmpptr);
} else {
- nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+ nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
/* create root node at level 1 */
node = nilfs_btree_get_root(btree);
@@ -1822,7 +1757,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
struct nilfs_btree_node *parent;
int ret;
@@ -1832,9 +1767,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
- ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
if (ret < 0)
return ret;
@@ -1846,9 +1780,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
&path[level].bp_ctxt);
if (ret < 0) {
- nilfs_bmap_abort_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_abort_update(dat,
+ &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
return ret;
}
}
@@ -1858,13 +1792,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
struct nilfs_btree_node *parent;
- nilfs_bmap_commit_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req,
+ btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
if (buffer_nilfs_node(path[level].bp_bh)) {
nilfs_btnode_commit_change_key(
@@ -1881,11 +1815,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
- nilfs_bmap_abort_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
if (buffer_nilfs_node(path[level].bp_bh))
nilfs_btnode_abort_change_key(
&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1827,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int minlevel,
- int *maxlevelp)
+ int minlevel, int *maxlevelp,
+ struct inode *dat)
{
int level, ret;
level = minlevel;
if (!buffer_nilfs_volatile(path[level].bp_bh)) {
- ret = nilfs_btree_prepare_update_v(btree, path, level);
+ ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
if (ret < 0)
return ret;
}
@@ -1909,7 +1842,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
!buffer_dirty(path[level].bp_bh)) {
WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
- ret = nilfs_btree_prepare_update_v(btree, path, level);
+ ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
if (ret < 0)
goto out;
}
@@ -1921,39 +1854,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
/* error */
out:
while (--level > minlevel)
- nilfs_btree_abort_update_v(btree, path, level);
+ nilfs_btree_abort_update_v(btree, path, level, dat);
if (!buffer_nilfs_volatile(path[level].bp_bh))
- nilfs_btree_abort_update_v(btree, path, level);
+ nilfs_btree_abort_update_v(btree, path, level, dat);
return ret;
}
static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int minlevel,
- int maxlevel,
- struct buffer_head *bh)
+ int minlevel, int maxlevel,
+ struct buffer_head *bh,
+ struct inode *dat)
{
int level;
if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
- nilfs_btree_commit_update_v(btree, path, minlevel);
+ nilfs_btree_commit_update_v(btree, path, minlevel, dat);
for (level = minlevel + 1; level <= maxlevel; level++)
- nilfs_btree_commit_update_v(btree, path, level);
+ nilfs_btree_commit_update_v(btree, path, level, dat);
}
static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level,
- struct buffer_head *bh)
+ int level, struct buffer_head *bh)
{
int maxlevel, ret;
struct nilfs_btree_node *parent;
+ struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
__u64 ptr;
get_bh(bh);
path[level].bp_bh = bh;
- ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+ ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+ dat);
if (ret < 0)
goto out;
@@ -1961,12 +1895,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
parent = nilfs_btree_get_node(btree, path, level + 1);
ptr = nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
- ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+ ret = nilfs_dat_mark_dirty(dat, ptr);
if (ret < 0)
goto out;
}
- nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+ nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
out:
brelse(path[level].bp_bh);
@@ -1986,15 +1920,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
WARN_ON(!buffer_dirty(bh));
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
if (buffer_nilfs_node(bh)) {
node = (struct nilfs_btree_node *)bh->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
} else {
key = nilfs_bmap_data_get_key(bmap, bh);
level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1947,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
nilfs_btree_propagate_p(btree, path, level, bh);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -2022,7 +1956,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
struct buffer_head *bh)
{
- return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+ return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
}
static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +1971,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
get_bh(bh);
node = (struct nilfs_btree_node *)bh->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
list_for_each(head, &lists[level]) {
cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
cnode = (struct nilfs_btree_node *)cbh->b_data;
- ckey = nilfs_btree_node_get_key(btree, cnode, 0);
+ ckey = nilfs_btree_node_get_key(cnode, 0);
if (key < ckey)
break;
}
@@ -2120,8 +2054,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
nilfs_btree_node_set_ptr(btree, parent,
path[level + 1].bp_index, blocknr);
- key = nilfs_btree_node_get_key(btree, parent,
- path[level + 1].bp_index);
+ key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
/* on-disk format */
binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
binfo->bi_dat.bi_level = level;
@@ -2137,6 +2070,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
union nilfs_binfo *binfo)
{
struct nilfs_btree_node *parent;
+ struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
__u64 key;
__u64 ptr;
union nilfs_bmap_ptr_req req;
@@ -2146,12 +2080,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
ptr = nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
req.bpr_ptr = ptr;
- ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
- if (unlikely(ret < 0))
+ ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+ if (ret < 0)
return ret;
+ nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
- key = nilfs_btree_node_get_key(btree, parent,
- path[level + 1].bp_index);
+ key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
/* on-disk format */
binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2105,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
if (buffer_nilfs_node(*bh)) {
node = (struct nilfs_btree_node *)(*bh)->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
} else {
key = nilfs_bmap_data_get_key(bmap, *bh);
level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2130,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -2207,19 +2141,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
sector_t blocknr,
union nilfs_binfo *binfo)
{
- struct nilfs_btree *btree;
struct nilfs_btree_node *node;
__u64 key;
int ret;
- btree = (struct nilfs_btree *)bmap;
- ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
+ ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+ blocknr);
if (ret < 0)
return ret;
if (buffer_nilfs_node(*bh)) {
node = (struct nilfs_btree_node *)(*bh)->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
+ key = nilfs_btree_node_get_key(node, 0);
} else
key = nilfs_bmap_data_get_key(bmap, *bh);
@@ -2239,10 +2172,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
if (ret < 0) {
@@ -2262,8 +2195,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
nilfs_bmap_set_dirty(&btree->bt_bmap);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbbc6b64..4b82d84ade75 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
struct nilfs_btree_path;
/**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
- __u8 bn_flags;
- __u8 bn_level;
- __le16 bn_nchildren;
- __le32 bn_pad;
-};
-
-/* flags */
-#define NILFS_BTREE_NODE_ROOT 0x01
-
-/* level */
-#define NILFS_BTREE_LEVEL_DATA 0
-#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX 14
-
-/**
* struct nilfs_btree - B-tree structure
* @bt_bmap: bmap base structure
*/
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..d5ad54e204a5 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
void *kaddr;
int ret;
- if (cno == 0)
- return -ENOENT; /* checkpoint number 0 is invalid */
+ /* CP number is invalid if it's zero or larger than the
+ largest exist one.*/
+ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
+ return -ENOENT;
down_read(&NILFS_MDT(cpfile)->mi_sem);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
goto out;
kaddr = kmap_atomic(bh->b_page, KM_USER0);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
- ret = nilfs_checkpoint_snapshot(cp);
+ if (nilfs_checkpoint_invalid(cp))
+ ret = -ENOENT;
+ else
+ ret = nilfs_checkpoint_snapshot(cp);
kunmap_atomic(kaddr, KM_USER0);
brelse(bh);
@@ -866,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
* exclusive with a new mount job. Though it doesn't cover
* umount, it's enough for the purpose.
*/
- mutex_lock(&nilfs->ns_mount_mutex);
if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
/* Current implementation does not have to protect
plain read-only mounts since they are exclusive
@@ -875,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
ret = -EBUSY;
} else
ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
- mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
case NILFS_SNAPSHOT:
return nilfs_cpfile_set_snapshot(cpfile, cno);
@@ -923,3 +926,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
up_read(&NILFS_MDT(cpfile)->mi_sem);
return ret;
}
+
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+ return nilfs_read_inode_common(cpfile, raw_inode);
+}
+
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+ struct inode *cpfile;
+
+ cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+ if (cpfile)
+ nilfs_mdt_set_entry_size(cpfile, cpsize,
+ sizeof(struct nilfs_cpfile_header));
+ return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..bc0809e0ab43 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
#include <linux/buffer_head.h>
#include <linux/nilfs2_fs.h>
-#define NILFS_CPFILE_GFP NILFS_MDT_GFP
-
int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
struct nilfs_checkpoint **,
@@ -42,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
size_t);
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+
#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..187dd07ba86c 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
#define NILFS_CNO_MIN ((__u64)1)
#define NILFS_CNO_MAX (~(__u64)0)
+struct nilfs_dat_info {
+ struct nilfs_mdt_info mi;
+ struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+ return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
static int nilfs_dat_prepare_entry(struct inode *dat,
struct nilfs_palloc_req *req, int create)
{
@@ -109,12 +119,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
nilfs_palloc_commit_free_entry(dat, req);
}
-void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
- nilfs_dat_abort_entry(dat, req);
- nilfs_palloc_abort_free_entry(dat, req);
-}
-
int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
@@ -140,11 +144,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
nilfs_dat_commit_entry(dat, req);
}
-void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
-{
- nilfs_dat_abort_entry(dat, req);
-}
-
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
@@ -222,6 +221,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
nilfs_dat_abort_entry(dat, req);
}
+int nilfs_dat_prepare_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq)
+{
+ int ret;
+
+ ret = nilfs_dat_prepare_end(dat, oldreq);
+ if (!ret) {
+ ret = nilfs_dat_prepare_alloc(dat, newreq);
+ if (ret < 0)
+ nilfs_dat_abort_end(dat, oldreq);
+ }
+ return ret;
+}
+
+void nilfs_dat_commit_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq, int dead)
+{
+ nilfs_dat_commit_end(dat, oldreq, dead);
+ nilfs_dat_commit_alloc(dat, newreq);
+}
+
+void nilfs_dat_abort_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq)
+{
+ nilfs_dat_abort_end(dat, oldreq);
+ nilfs_dat_abort_alloc(dat, newreq);
+}
+
/**
* nilfs_dat_mark_dirty -
* @dat: DAT file inode
@@ -405,3 +435,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
return nvi;
}
+
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+ return nilfs_read_inode_common(dat, raw_inode);
+}
+
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+ static struct lock_class_key dat_lock_key;
+ struct inode *dat;
+ struct nilfs_dat_info *di;
+ int err;
+
+ dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+ if (dat) {
+ err = nilfs_palloc_init_blockgroup(dat, entry_size);
+ if (unlikely(err)) {
+ nilfs_mdt_destroy(dat);
+ return NULL;
+ }
+
+ di = NILFS_DAT_I(dat);
+ lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+ nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+ }
+ return dat;
+}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..d31c3aab0efe 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#define NILFS_DAT_GFP NILFS_MDT_GFP
struct nilfs_palloc_req;
@@ -39,14 +38,22 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
sector_t);
-void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *);
+void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *);
int nilfs_dat_mark_dirty(struct inode *, __u64);
int nilfs_dat_freev(struct inode *, __u64 *, size_t);
int nilfs_dat_move(struct inode *, __u64, sector_t);
ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+
#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
return 0;
}
-struct file_operations nilfs_dir_operations = {
+const struct file_operations nilfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = nilfs_readdir,
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
direct->d_bmap.b_last_allocated_ptr = ptr;
}
-static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
- __u64 key,
- union nilfs_bmap_ptr_req *req,
- struct nilfs_bmap_stats *stats)
-{
- int ret;
-
- if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
- req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
- ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
- if (ret < 0)
- return ret;
-
- stats->bs_nblocks = 1;
- return 0;
-}
-
-static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key, __u64 ptr)
-{
- struct buffer_head *bh;
-
- /* ptr must be a pointer to a buffer head. */
- bh = (struct buffer_head *)((unsigned long)ptr);
- set_buffer_nilfs_volatile(bh);
-
- nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
- nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
-
- if (!nilfs_bmap_dirty(&direct->d_bmap))
- nilfs_bmap_set_dirty(&direct->d_bmap);
-
- if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
- nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
-}
-
static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
{
- struct nilfs_direct *direct;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
union nilfs_bmap_ptr_req req;
- struct nilfs_bmap_stats stats;
+ struct inode *dat = NULL;
+ struct buffer_head *bh;
int ret;
- direct = (struct nilfs_direct *)bmap;
if (key > NILFS_DIRECT_KEY_MAX)
return -ENOENT;
if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
return -EEXIST;
- ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
- if (ret < 0)
- return ret;
- nilfs_direct_commit_insert(direct, &req, key, ptr);
- nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+ if (NILFS_BMAP_USE_VBN(bmap)) {
+ req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+ dat = nilfs_bmap_get_dat(bmap);
+ }
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
+ if (!ret) {
+ /* ptr must be a pointer to a buffer head. */
+ bh = (struct buffer_head *)((unsigned long)ptr);
+ set_buffer_nilfs_volatile(bh);
- return 0;
-}
+ nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
+ nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
-static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key,
- struct nilfs_bmap_stats *stats)
-{
- int ret;
+ if (!nilfs_bmap_dirty(bmap))
+ nilfs_bmap_set_dirty(bmap);
- req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
- ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
- if (!ret)
- stats->bs_nblocks = 1;
- return ret;
-}
+ if (NILFS_BMAP_USE_VBN(bmap))
+ nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
-static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key)
-{
- nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
- nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+ nilfs_bmap_add_blocks(bmap, 1);
+ }
+ return ret;
}
static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
{
- struct nilfs_direct *direct;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
union nilfs_bmap_ptr_req req;
- struct nilfs_bmap_stats stats;
+ struct inode *dat;
int ret;
- direct = (struct nilfs_direct *)bmap;
- if ((key > NILFS_DIRECT_KEY_MAX) ||
+ if (key > NILFS_DIRECT_KEY_MAX ||
nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
return -ENOENT;
- ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
- if (ret < 0)
- return ret;
- nilfs_direct_commit_delete(direct, &req, key);
- nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+ dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+ req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
- return 0;
+ ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
+ if (!ret) {
+ nilfs_bmap_commit_end_ptr(bmap, &req, dat);
+ nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+ nilfs_bmap_sub_blocks(bmap, 1);
+ }
+ return ret;
}
static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
return 0;
}
-static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
- struct buffer_head *bh)
+static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+ struct buffer_head *bh)
{
- union nilfs_bmap_ptr_req oldreq, newreq;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+ struct nilfs_palloc_req oldreq, newreq;
+ struct inode *dat;
__u64 key;
__u64 ptr;
int ret;
- key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
+ if (!NILFS_BMAP_USE_VBN(bmap))
+ return 0;
+
+ dat = nilfs_bmap_get_dat(bmap);
+ key = nilfs_bmap_data_get_key(bmap, bh);
ptr = nilfs_direct_get_ptr(direct, key);
if (!buffer_nilfs_volatile(bh)) {
- oldreq.bpr_ptr = ptr;
- newreq.bpr_ptr = ptr;
- ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
- &newreq);
+ oldreq.pr_entry_nr = ptr;
+ newreq.pr_entry_nr = ptr;
+ ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
if (ret < 0)
return ret;
- nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
+ nilfs_dat_commit_update(dat, &oldreq, &newreq,
+ bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
set_buffer_nilfs_volatile(bh);
- nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
+ nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
} else
- ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
+ ret = nilfs_dat_mark_dirty(dat, ptr);
return ret;
}
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
- struct buffer_head *bh)
-{
- struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-
- return NILFS_BMAP_USE_VBN(bmap) ?
- nilfs_direct_propagate_v(direct, bh) : 0;
-}
-
static int nilfs_direct_assign_v(struct nilfs_direct *direct,
__u64 key, __u64 ptr,
struct buffer_head **bh,
sector_t blocknr,
union nilfs_binfo *binfo)
{
+ struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
union nilfs_bmap_ptr_req req;
int ret;
req.bpr_ptr = ptr;
- ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
- if (unlikely(ret < 0))
- return ret;
-
- binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
- binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
-
- return 0;
+ ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+ if (!ret) {
+ nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+ binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+ }
+ return ret;
}
static int nilfs_direct_assign_p(struct nilfs_direct *direct,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return 0;
}
-struct vm_operations_struct nilfs_file_vm_ops = {
+static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nilfs_page_mkwrite,
};
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* We have mostly NULL's here: the current defaults are ok for
* the nilfs filesystem.
*/
-struct file_operations nilfs_file_operations = {
+const struct file_operations nilfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations nilfs_file_inode_operations = {
+const struct inode_operations nilfs_file_inode_operations = {
.truncate = nilfs_truncate,
.setattr = nilfs_setattr,
.permission = nilfs_permission,
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5cee90..dd5f7e0a95f6 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
+ nilfs_palloc_clear_cache(dat);
+ nilfs_palloc_clear_cache(gcdat);
nilfs_clear_dirty_pages(mapping);
nilfs_copy_back_pages(mapping, gmapping);
/* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
gcdat->i_state = I_CLEAR;
gii->i_flags = 0;
+ nilfs_palloc_clear_cache(gcdat);
truncate_inode_pages(gcdat->i_mapping, 0);
truncate_inode_pages(&gii->i_btnode_cache, 0);
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e16a6664dfa2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
#include "dat.h"
#include "ifile.h"
-static struct address_space_operations def_gcinode_aops = {
+static const struct address_space_operations def_gcinode_aops = {
.sync_page = block_sync_page,
};
@@ -149,7 +149,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
__u64 vbn, struct buffer_head **out_bh)
{
int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
- vbn ? : pbn, pbn, out_bh, 0);
+ vbn ? : pbn, pbn, out_bh);
if (ret == -EEXIST) /* internal code (cache hit) */
ret = 0;
return ret;
@@ -212,9 +212,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs)
static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
__u64 cno)
{
- struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+ struct inode *inode;
struct nilfs_inode_info *ii;
+ inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
if (!inode)
return NULL;
@@ -265,7 +266,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
*/
void nilfs_clear_gcinode(struct inode *inode)
{
- nilfs_mdt_clear(inode);
nilfs_mdt_destroy(inode);
}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401f209f..922d9dd42c8f 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
#include "alloc.h"
#include "ifile.h"
+
+struct nilfs_ifile_info {
+ struct nilfs_mdt_info mi;
+ struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+ return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
/**
* nilfs_ifile_create_inode - create a new disk inode
* @ifile: ifile inode
@@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
}
return err;
}
+
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+ struct inode *ifile;
+ int err;
+
+ ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+ sizeof(struct nilfs_ifile_info));
+ if (ifile) {
+ err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+ if (unlikely(err)) {
+ nilfs_mdt_destroy(ifile);
+ return NULL;
+ }
+ nilfs_palloc_setup_cache(ifile,
+ &NILFS_IFILE_I(ifile)->palloc_cache);
+ }
+ return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..cbca32e498f2 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
#include "mdt.h"
#include "alloc.h"
-#define NILFS_IFILE_GFP NILFS_MDT_GFP
static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
@@ -50,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
int nilfs_ifile_delete_inode(struct inode *, ino_t);
int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+
#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..a16c179f2b9a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -97,6 +97,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
nilfs_transaction_abort(inode->i_sb);
goto out;
}
+ mark_inode_dirty(inode);
nilfs_transaction_commit(inode->i_sb); /* never fails */
/* Error handling should be detailed */
set_buffer_new(bh_result);
@@ -238,7 +239,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
return size;
}
-struct address_space_operations nilfs_aops = {
+const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.sync_page = block_sync_page,
@@ -400,6 +401,7 @@ int nilfs_read_inode_common(struct inode *inode,
ii->i_dir_acl = S_ISREG(inode->i_mode) ?
0 : le32_to_cpu(raw_inode->i_dir_acl);
#endif
+ ii->i_dir_start_lookup = 0;
ii->i_cno = 0;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -430,7 +432,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
- if (nilfs_read_inode_common(inode, raw_inode))
+ err = nilfs_read_inode_common(inode, raw_inode);
+ if (err)
goto failed_unmap;
if (S_ISREG(inode->i_mode)) {
@@ -523,7 +526,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
- /* The buffer is guarded with lock_buffer() by the caller */
if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -597,6 +599,7 @@ void nilfs_truncate(struct inode *inode)
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
+ mark_inode_dirty(inode);
nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
nilfs_transaction_commit(sb);
/* May construct a logical segment and may fail in sync mode.
@@ -621,6 +624,7 @@ void nilfs_delete_inode(struct inode *inode)
truncate_inode_pages(&inode->i_data, 0);
nilfs_truncate_bmap(ii, 0);
+ mark_inode_dirty(inode);
nilfs_free_inode(inode);
/* nilfs_free_inode() marks inode buffer dirty */
if (IS_SYNC(inode))
@@ -662,7 +666,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
int err;
spin_lock(&sbi->s_inode_lock);
- /* Caller of this function MUST lock s_inode_lock */
if (ii->i_bh == NULL) {
spin_unlock(&sbi->s_inode_lock);
err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
@@ -744,9 +747,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
"failed to reget inode block.\n");
return err;
}
- lock_buffer(ibh);
nilfs_update_inode(inode, ibh);
- unlock_buffer(ibh);
nilfs_mdt_mark_buffer_dirty(ibh);
nilfs_mdt_mark_dirty(sbi->s_ifile);
brelse(ibh);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..f6af76042d80 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
- struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
+ struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+ struct inode *cpfile = nilfs->ns_cpfile;
struct nilfs_transaction_info ti;
struct nilfs_cpmode cpmode;
int ret;
@@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
return -EFAULT;
+ mutex_lock(&nilfs->ns_mount_mutex);
nilfs_transaction_begin(inode->i_sb, &ti, 0);
ret = nilfs_cpfile_change_cpmode(
cpfile, cpmode.cm_cno, cpmode.cm_mode);
if (unlikely(ret < 0)) {
nilfs_transaction_abort(inode->i_sb);
+ mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
}
nilfs_transaction_commit(inode->i_sb); /* never fails */
+ mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
}
@@ -297,7 +301,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
(unsigned long long)vdesc->vd_vblocknr);
return ret;
}
- bh->b_private = vdesc;
+ if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
+ printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
+ "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
+ brelse(bh);
+ return -EEXIST;
+ }
list_add_tail(&bh->b_assoc_buffers, buffers);
return 0;
}
@@ -335,24 +350,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
ret = nilfs_gccache_wait_and_mark_dirty(bh);
if (unlikely(ret < 0)) {
- if (ret == -EEXIST) {
- vdesc = bh->b_private;
- printk(KERN_CRIT
- "%s: conflicting %s buffer: "
- "ino=%llu, cno=%llu, offset=%llu, "
- "blocknr=%llu, vblocknr=%llu\n",
- __func__,
- vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
- }
+ WARN_ON(ret == -EEXIST);
goto failed;
}
list_del_init(&bh->b_assoc_buffers);
- bh->b_private = NULL;
brelse(bh);
}
return nmembs;
@@ -360,7 +361,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
failed:
list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
list_del_init(&bh->b_assoc_buffers);
- bh->b_private = NULL;
brelse(bh);
}
return ret;
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
const char *msg;
int ret;
- ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
- if (ret < 0) {
- msg = "cannot read source blocks";
- goto failed;
- }
-
ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
if (ret < 0) {
/*
@@ -477,7 +471,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- nilfs_remove_all_gcinode(nilfs);
printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
msg, ret);
return ret;
@@ -548,7 +541,27 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
}
}
- ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+ /*
+ * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+ * which will operates an inode list without blocking.
+ * To protect the list from concurrent operations,
+ * nilfs_ioctl_move_blocks should be atomic operation.
+ */
+ if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+ if (ret < 0)
+ printk(KERN_ERR "NILFS: GC failed during preparation: "
+ "cannot read source blocks: err=%d\n", ret);
+ else
+ ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+
+ if (ret < 0)
+ nilfs_remove_all_gcinode(nilfs);
+ clear_nilfs_gc_running(nilfs);
out_free:
while (--n >= 0)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..06713ffcc7f2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
goto failed_unlock;
err = -EEXIST;
- if (buffer_uptodate(bh) || buffer_mapped(bh))
+ if (buffer_uptodate(bh))
goto failed_bh;
-#if 0
- /* The uptodate flag is not protected by the page lock, but
- the mapped flag is. Thus, we don't have to wait the buffer. */
+
wait_on_buffer(bh);
if (buffer_uptodate(bh))
goto failed_bh;
-#endif
bh->b_bdev = nilfs->ns_bdev;
err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
int mode, struct buffer_head **out_bh)
{
struct buffer_head *bh;
- unsigned long blknum = 0;
+ __u64 blknum = 0;
int ret = -ENOMEM;
bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
unlock_buffer(bh);
goto out;
}
- if (!buffer_mapped(bh)) { /* unused buffer */
- ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
- &blknum);
- if (unlikely(ret)) {
- unlock_buffer(bh);
- goto failed_bh;
- }
- bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
- bh->b_blocknr = blknum;
- set_buffer_mapped(bh);
+
+ ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
+ if (unlikely(ret)) {
+ unlock_buffer(bh);
+ goto failed_bh;
}
+ bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+ bh->b_blocknr = (sector_t)blknum;
+ set_buffer_mapped(bh);
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
@@ -191,7 +186,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
}
static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
- struct buffer_head **out_bh)
+ int readahead, struct buffer_head **out_bh)
{
struct buffer_head *first_bh, *bh;
unsigned long blkoff;
@@ -205,16 +200,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
if (unlikely(err))
goto failed;
- blkoff = block + 1;
- for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
- err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
- if (likely(!err || err == -EEXIST))
- brelse(bh);
- else if (err != -EBUSY)
- break; /* abort readahead if bmap lookup failed */
-
- if (!buffer_locked(first_bh))
- goto out_no_wait;
+ if (readahead) {
+ blkoff = block + 1;
+ for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+ err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+ if (likely(!err || err == -EEXIST))
+ brelse(bh);
+ else if (err != -EBUSY)
+ break;
+ /* abort readahead if bmap lookup failed */
+ if (!buffer_locked(first_bh))
+ goto out_no_wait;
+ }
}
wait_on_buffer(first_bh);
@@ -268,7 +265,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
/* Should be rewritten with merging nilfs_mdt_read_block() */
retry:
- ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+ ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
if (!create || ret != -ENOENT)
return ret;
@@ -376,7 +373,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
struct buffer_head *bh;
int err;
- err = nilfs_mdt_read_block(inode, block, &bh);
+ err = nilfs_mdt_read_block(inode, block, 0, &bh);
if (unlikely(err))
return err;
nilfs_mark_buffer_dirty(bh);
@@ -402,6 +399,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
struct inode *inode = container_of(page->mapping,
struct inode, i_data);
struct super_block *sb = inode->i_sb;
+ struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
struct nilfs_sb_info *writer = NULL;
int err = 0;
@@ -411,9 +409,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
if (page->mapping->assoc_mapping)
return 0; /* Do not request flush for shadow page cache */
if (!sb) {
- writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
+ down_read(&nilfs->ns_writer_sem);
+ writer = nilfs->ns_writer;
if (!writer) {
- nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+ up_read(&nilfs->ns_writer_sem);
return -EROFS;
}
sb = writer->s_super;
@@ -425,18 +424,18 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
nilfs_flush_segment(sb, inode->i_ino);
if (writer)
- nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+ up_read(&nilfs->ns_writer_sem);
return err;
}
-static struct address_space_operations def_mdt_aops = {
+static const struct address_space_operations def_mdt_aops = {
.writepage = nilfs_mdt_write_page,
.sync_page = block_sync_page,
};
-static struct inode_operations def_mdt_iops;
-static struct file_operations def_mdt_fops;
+static const struct inode_operations def_mdt_iops;
+static const struct file_operations def_mdt_fops;
/*
* NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
@@ -448,9 +447,17 @@ static struct file_operations def_mdt_fops;
* longer than those of the super block structs; they may continue for
* several consecutive mounts/umounts. This would need discussions.
*/
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
struct inode *
nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
- ino_t ino, gfp_t gfp_mask)
+ ino_t ino, gfp_t gfp_mask, size_t objsz)
{
struct inode *inode = nilfs_alloc_inode_common(nilfs);
@@ -458,8 +465,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
return NULL;
else {
struct address_space * const mapping = &inode->i_data;
- struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+ struct nilfs_mdt_info *mi;
+ mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
if (!mi) {
nilfs_destroy_inode(inode);
return NULL;
@@ -516,10 +524,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
}
struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
- ino_t ino, gfp_t gfp_mask)
+ ino_t ino, size_t objsz)
{
- struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
+ struct inode *inode;
+ inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
if (!inode)
return NULL;
@@ -546,14 +555,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
&NILFS_I(orig)->i_btnode_cache;
}
-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
invalidate_mapping_pages(inode->i_mapping, 0, -1);
truncate_inode_pages(inode->i_mapping, 0);
- nilfs_bmap_clear(ii->i_bmap);
+ if (test_bit(NILFS_I_BMAP, &ii->i_state))
+ nilfs_bmap_clear(ii->i_bmap);
nilfs_btnode_cache_clear(&ii->i_btnode_cache);
}
@@ -561,6 +571,10 @@ void nilfs_mdt_destroy(struct inode *inode)
{
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+ if (mdi->mi_palloc_cache)
+ nilfs_palloc_destroy_cache(inode);
+ nilfs_mdt_clear(inode);
+
kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
kfree(mdi);
nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..6c4bbb0470fc 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
* @mi_entry_size: size of an entry
* @mi_first_entry_offset: offset to the first entry
* @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
* @mi_blocks_per_group: number of blocks in a group
* @mi_blocks_per_desc_block: number of blocks per descriptor block
*/
@@ -46,6 +47,7 @@ struct nilfs_mdt_info {
unsigned mi_entry_size;
unsigned mi_first_entry_offset;
unsigned long mi_entries_per_block;
+ struct nilfs_palloc_cache *mi_palloc_cache;
unsigned long mi_blocks_per_group;
unsigned long mi_blocks_per_desc_block;
};
@@ -75,11 +77,10 @@ int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
int nilfs_mdt_fetch_dirty(struct inode *);
struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
- gfp_t);
+ size_t);
struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
- ino_t, gfp_t);
+ ino_t, gfp_t, size_t);
void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
void nilfs_mdt_set_shadow(struct inode *, struct inode *);
@@ -105,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
#define nilfs_mdt_bgl_lock(inode, bg) \
(&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
-
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
- unsigned n)
-{
- return nilfs_read_inode_common(
- inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
- unsigned n)
-{
- nilfs_write_inode_common(
- inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
-
#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
return err;
}
-struct inode_operations nilfs_dir_inode_operations = {
+const struct inode_operations nilfs_dir_inode_operations = {
.create = nilfs_create,
.lookup = nilfs_lookup,
.link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
.permission = nilfs_permission,
};
-struct inode_operations nilfs_special_inode_operations = {
+const struct inode_operations nilfs_special_inode_operations = {
.setattr = nilfs_setattr,
.permission = nilfs_permission,
};
-struct inode_operations nilfs_symlink_inode_operations = {
+const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,13 +294,13 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
/*
* Inodes and files operations
*/
-extern struct file_operations nilfs_dir_operations;
-extern struct inode_operations nilfs_file_inode_operations;
-extern struct file_operations nilfs_file_operations;
-extern struct address_space_operations nilfs_aops;
-extern struct inode_operations nilfs_dir_inode_operations;
-extern struct inode_operations nilfs_special_inode_operations;
-extern struct inode_operations nilfs_symlink_inode_operations;
+extern const struct file_operations nilfs_dir_operations;
+extern const struct inode_operations nilfs_file_inode_operations;
+extern const struct file_operations nilfs_file_operations;
+extern const struct address_space_operations nilfs_aops;
+extern const struct inode_operations nilfs_dir_inode_operations;
+extern const struct inode_operations nilfs_special_inode_operations;
+extern const struct inode_operations nilfs_symlink_inode_operations;
/*
* filesystem type
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6d5412eff28f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
printk(KERN_WARNING
"NILFS warning: error recovering data block "
"(err=%d, ino=%lu, block-offset=%llu)\n",
- err, rb->ino, (unsigned long long)rb->blkoff);
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -769,14 +770,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
nilfs_finish_roll_forward(nilfs, sbi, ri);
}
- nilfs_detach_checkpoint(sbi);
- return 0;
-
failed:
nilfs_detach_checkpoint(sbi);
- nilfs_mdt_clear(nilfs->ns_cpfile);
- nilfs_mdt_clear(nilfs->ns_sufile);
- nilfs_mdt_clear(nilfs->ns_dat);
return err;
}
@@ -803,6 +798,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
struct nilfs_segsum_info ssi;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
+ sector_t b, end;
u64 seg_seq;
__u64 segnum, nextnum = 0;
__u64 cno;
@@ -818,6 +814,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
/* Calculate range of segment */
nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+ /* Read ahead segment */
+ b = seg_start;
+ while (b <= seg_end)
+ sb_breadahead(sbi->s_super, b++);
+
for (;;) {
/* Load segment summary */
ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
@@ -840,14 +841,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
ri->ri_nextnum = nextnum;
empty_seg = 0;
+ if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+ /* This will never happen because a superblock
+ (last_segment) always points to a pseg
+ having a super root. */
+ ret = NILFS_SEG_FAIL_CONSISTENCY;
+ goto failed;
+ }
+
+ if (pseg_start == seg_start) {
+ nilfs_get_segment_range(nilfs, nextnum, &b, &end);
+ while (b <= end)
+ sb_breadahead(sbi->s_super, b++);
+ }
if (!NILFS_SEG_HAS_SR(&ssi)) {
- if (!scan_newer) {
- /* This will never happen because a superblock
- (last_segment) always points to a pseg
- having a super root. */
- ret = NILFS_SEG_FAIL_CONSISTENCY;
- goto failed;
- }
if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
ri->ri_lsegs_start = pseg_start;
ri->ri_lsegs_start_seq = seg_seq;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
{
struct bio *bio;
- bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+ bio = bio_alloc(GFP_NOIO, nr_vecs);
if (bio == NULL) {
while (!bio && (nr_vecs >>= 1))
- bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+ bio = bio_alloc(GFP_NOIO, nr_vecs);
}
if (likely(bio)) {
bio->bi_bdev = sb->s_bdev;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..097f9c467fef 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -974,12 +974,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
raw_sr->sr_flags = 0;
- nilfs_mdt_write_inode_direct(
- nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
- nilfs_mdt_write_inode_direct(
- nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
- nilfs_mdt_write_inode_direct(
- nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
+ nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
+ NILFS_SR_DAT_OFFSET(isz), 1);
+ nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
+ NILFS_SR_CPFILE_OFFSET(isz), 1);
+ nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
+ NILFS_SR_SUFILE_OFFSET(isz), 1);
}
static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,21 +1273,6 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
return err;
}
-static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
-{
- struct buffer_head *bh_su;
- struct nilfs_segment_usage *raw_su;
- int err;
-
- err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
- if (unlikely(err))
- return err;
- nilfs_mdt_mark_buffer_dirty(bh_su);
- nilfs_mdt_mark_dirty(sufile);
- nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
- return 0;
-}
-
static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
@@ -1312,7 +1297,7 @@ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
}
sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
- err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
+ err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
if (unlikely(err))
return err;
@@ -1352,7 +1337,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
* not be dirty. The following call ensures that the buffer is dirty
* and will pin the buffer on memory until the sufile is written.
*/
- err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+ err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
if (unlikely(err))
return err;
@@ -1472,21 +1457,16 @@ static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
struct inode *sufile)
{
struct nilfs_segment_buffer *segbuf;
- struct buffer_head *bh_su;
- struct nilfs_segment_usage *raw_su;
unsigned long live_blocks;
int ret;
list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
- ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
- &raw_su, &bh_su);
- WARN_ON(ret); /* always succeed because bh_su is dirty */
live_blocks = segbuf->sb_sum.nblocks +
(segbuf->sb_pseg_start - segbuf->sb_fseg_start);
- raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
- raw_su->su_nblocks = cpu_to_le32(live_blocks);
- nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
- bh_su);
+ ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+ live_blocks,
+ sci->sc_seg_ctime);
+ WARN_ON(ret); /* always succeed because the segusage is dirty */
}
}
@@ -1494,25 +1474,18 @@ static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
struct inode *sufile)
{
struct nilfs_segment_buffer *segbuf;
- struct buffer_head *bh_su;
- struct nilfs_segment_usage *raw_su;
int ret;
segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
- ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
- &raw_su, &bh_su);
- WARN_ON(ret); /* always succeed because bh_su is dirty */
- raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
- segbuf->sb_fseg_start);
- nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
+ ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+ segbuf->sb_pseg_start -
+ segbuf->sb_fseg_start, 0);
+ WARN_ON(ret); /* always succeed because the segusage is dirty */
list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
- ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
- &raw_su, &bh_su);
+ ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+ 0, 0);
WARN_ON(ret); /* always succeed */
- raw_su->su_nblocks = 0;
- nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
- bh_su);
}
}
@@ -2468,17 +2441,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
/* Clear requests (even when the construction failed) */
spin_lock(&sci->sc_state_lock);
- sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
-
if (req->mode == SC_LSEG_SR) {
+ sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
sci->sc_seq_done = req->seq_accepted;
nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
sci->sc_flush_request = 0;
- } else if (req->mode == SC_FLUSH_FILE)
- sci->sc_flush_request &= ~FLUSH_FILE_BIT;
- else if (req->mode == SC_FLUSH_DAT)
- sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+ } else {
+ if (req->mode == SC_FLUSH_FILE)
+ sci->sc_flush_request &= ~FLUSH_FILE_BIT;
+ else if (req->mode == SC_FLUSH_DAT)
+ sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+ /* re-enable timer if checkpoint creation was not done */
+ if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+ time_before(jiffies, sci->sc_timer->expires))
+ add_timer(sci->sc_timer);
+ }
spin_unlock(&sci->sc_state_lock);
}
@@ -2501,7 +2479,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
nilfs_discontinued(nilfs)) {
down_write(&nilfs->ns_sem);
- req->sb_err = nilfs_commit_super(sbi, 0);
+ req->sb_err = nilfs_commit_super(sbi,
+ nilfs_altsb_need_update(nilfs));
up_write(&nilfs->ns_sem);
}
}
@@ -2689,6 +2668,7 @@ static int nilfs_segctor_thread(void *arg)
} else {
DEFINE_WAIT(wait);
int should_sleep = 1;
+ struct the_nilfs *nilfs;
prepare_to_wait(&sci->sc_wait_daemon, &wait,
TASK_INTERRUPTIBLE);
@@ -2709,6 +2689,9 @@ static int nilfs_segctor_thread(void *arg)
finish_wait(&sci->sc_wait_daemon, &wait);
timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
time_after_eq(jiffies, sci->sc_timer->expires));
+ nilfs = sci->sc_sbi->s_nilfs;
+ if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+ set_nilfs_discontinued(nilfs);
}
goto loop;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4a59cc..b6c36d0cc331 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -31,6 +31,16 @@
#include "sufile.h"
+struct nilfs_sufile_info {
+ struct nilfs_mdt_info mi;
+ unsigned long ncleansegs;
+};
+
+static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
+{
+ return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
+}
+
static inline unsigned long
nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
{
@@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
max - curr + 1);
}
-static inline struct nilfs_sufile_header *
-nilfs_sufile_block_get_header(const struct inode *sufile,
- struct buffer_head *bh,
- void *kaddr)
-{
- return kaddr + bh_offset(bh);
-}
-
static struct nilfs_segment_usage *
nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
}
/**
+ * nilfs_sufile_get_ncleansegs - return the number of clean segments
+ * @sufile: inode of segment usage file
+ */
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
+{
+ return NILFS_SUI(sufile)->ncleansegs;
+}
+
+/**
* nilfs_sufile_updatev - modify multiple segment usages at a time
* @sufile: inode of segment usage file
* @segnumv: array of segment numbers
@@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
if (ret < 0)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
- header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+ header = kaddr + bh_offset(header_bh);
ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
kunmap_atomic(kaddr, KM_USER0);
kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
- header = nilfs_sufile_block_get_header(
- sufile, header_bh, kaddr);
+ header = kaddr + bh_offset(header_bh);
le64_add_cpu(&header->sh_ncleansegs, -1);
le64_add_cpu(&header->sh_ndirtysegs, 1);
header->sh_last_alloc = cpu_to_le64(segnum);
kunmap_atomic(kaddr, KM_USER0);
+ NILFS_SUI(sufile)->ncleansegs--;
nilfs_mdt_mark_buffer_dirty(header_bh);
nilfs_mdt_mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kunmap_atomic(kaddr, KM_USER0);
nilfs_sufile_mod_counter(header_bh, -1, 1);
+ NILFS_SUI(sufile)->ncleansegs--;
+
nilfs_mdt_mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
@@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
kunmap_atomic(kaddr, KM_USER0);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+ NILFS_SUI(sufile)->ncleansegs -= clean;
+
nilfs_mdt_mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
@@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
nilfs_mdt_mark_buffer_dirty(su_bh);
nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+ NILFS_SUI(sufile)->ncleansegs++;
+
nilfs_mdt_mark_dirty(sufile);
}
/**
- * nilfs_sufile_get_segment_usage - get a segment usage
+ * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
* @sufile: inode of segment usage file
* @segnum: segment number
- * @sup: pointer to segment usage
- * @bhp: pointer to buffer head
- *
- * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
- * specified by @segnum.
- *
- * Return Value: On success, 0 is returned, and the segment usage and the
- * buffer head of the buffer on which the segment usage is located are stored
- * in the place pointed by @sup and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid segment usage number.
*/
-int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
- struct nilfs_segment_usage **sup,
- struct buffer_head **bhp)
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
{
struct buffer_head *bh;
- struct nilfs_segment_usage *su;
- void *kaddr;
int ret;
- /* segnum is 0 origin */
- if (segnum >= nilfs_sufile_get_nsegments(sufile))
- return -EINVAL;
- down_write(&NILFS_MDT(sufile)->mi_sem);
- ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
- if (ret < 0)
- goto out_sem;
- kaddr = kmap(bh->b_page);
- su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
- if (nilfs_segment_usage_error(su)) {
- kunmap(bh->b_page);
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+ if (!ret) {
+ nilfs_mdt_mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(sufile);
brelse(bh);
- ret = -EINVAL;
- goto out_sem;
}
-
- if (sup != NULL)
- *sup = su;
- *bhp = bh;
-
- out_sem:
- up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
/**
- * nilfs_sufile_put_segment_usage - put a segment usage
+ * nilfs_sufile_set_segment_usage - set usage of a segment
* @sufile: inode of segment usage file
* @segnum: segment number
- * @bh: buffer head
- *
- * Description: nilfs_sufile_put_segment_usage() releases the segment usage
- * specified by @segnum. @bh must be the buffer head which have been returned
- * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
+ * @nblocks: number of live blocks in the segment
+ * @modtime: modification time (option)
*/
-void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
- struct buffer_head *bh)
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+ unsigned long nblocks, time_t modtime)
{
- kunmap(bh->b_page);
+ struct buffer_head *bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ int ret;
+
+ down_write(&NILFS_MDT(sufile)->mi_sem);
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+ if (ret < 0)
+ goto out_sem;
+
+ kaddr = kmap_atomic(bh->b_page, KM_USER0);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ WARN_ON(nilfs_segment_usage_error(su));
+ if (modtime)
+ su->su_lastmod = cpu_to_le64(modtime);
+ su->su_nblocks = cpu_to_le32(nblocks);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ nilfs_mdt_mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(sufile);
brelse(bh);
+
+ out_sem:
+ up_write(&NILFS_MDT(sufile)->mi_sem);
+ return ret;
}
/**
@@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
- header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+ header = kaddr + bh_offset(header_bh);
sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
return ret;
}
-/**
- * nilfs_sufile_get_ncleansegs - get the number of clean segments
- * @sufile: inode of segment usage file
- * @nsegsp: pointer to the number of clean segments
- *
- * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
- * segments.
- *
- * Return Value: On success, 0 is returned and the number of clean segments is
- * stored in the place pointed by @nsegsp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
-{
- struct nilfs_sustat sustat;
- int ret;
-
- ret = nilfs_sufile_get_stat(sufile, &sustat);
- if (ret == 0)
- *nsegsp = sustat.ss_ncleansegs;
- return ret;
-}
-
void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
struct buffer_head *header_bh,
struct buffer_head *su_bh)
@@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
nilfs_segment_usage_set_error(su);
kunmap_atomic(kaddr, KM_USER0);
- if (suclean)
+ if (suclean) {
nilfs_sufile_mod_counter(header_bh, -1, 0);
+ NILFS_SUI(sufile)->ncleansegs--;
+ }
nilfs_mdt_mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
@@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
up_read(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
+
+/**
+ * nilfs_sufile_read - read sufile inode
+ * @sufile: sufile inode
+ * @raw_inode: on-disk sufile inode
+ */
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+{
+ struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+ struct buffer_head *header_bh;
+ struct nilfs_sufile_header *header;
+ void *kaddr;
+ int ret;
+
+ ret = nilfs_read_inode_common(sufile, raw_inode);
+ if (ret < 0)
+ return ret;
+
+ ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+ if (!ret) {
+ kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ header = kaddr + bh_offset(header_bh);
+ sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+ kunmap_atomic(kaddr, KM_USER0);
+ brelse(header_bh);
+ }
+ return ret;
+}
+
+/**
+ * nilfs_sufile_new - create sufile
+ * @nilfs: nilfs object
+ * @susize: size of a segment usage entry
+ */
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
+{
+ struct inode *sufile;
+
+ sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
+ sizeof(struct nilfs_sufile_info));
+ if (sufile)
+ nilfs_mdt_set_entry_size(sufile, susize,
+ sizeof(struct nilfs_sufile_header));
+ return sufile;
+}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..15163b8aff7d 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,21 +28,19 @@
#include <linux/nilfs2_fs.h>
#include "mdt.h"
-#define NILFS_SUFILE_GFP NILFS_MDT_GFP
static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
{
return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
}
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
+
int nilfs_sufile_alloc(struct inode *, __u64 *);
-int nilfs_sufile_get_segment_usage(struct inode *, __u64,
- struct nilfs_segment_usage **,
- struct buffer_head **);
-void nilfs_sufile_put_segment_usage(struct inode *, __u64,
- struct buffer_head *);
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+ unsigned long nblocks, time_t modtime);
int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
size_t);
@@ -63,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
struct buffer_head *);
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+
/**
* nilfs_sufile_scrap - make a segment garbage
* @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..5403b3ef3a42 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
#include <linux/writeback.h>
#include <linux/kobject.h>
#include <linux/exportfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
#include "nilfs.h"
#include "mdt.h"
#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
"(NILFS)");
MODULE_LICENSE("GPL");
-static void nilfs_write_super(struct super_block *sb);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
lock_kernel();
- if (sb->s_dirt)
- nilfs_write_super(sb);
-
nilfs_detach_segment_constructor(sbi);
if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
unlock_kernel();
}
-/**
- * nilfs_write_super - write super block(s) of NILFS
- * @sb: super_block
- *
- * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
- * clears s_dirt. This function is called in the section protected by
- * lock_super().
- *
- * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
- * of the struct the_nilfs. Lock order must be as follows:
- *
- * 1. lock_super()
- * 2. down_write(&nilfs->ns_sem)
- *
- * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
- * of the super block (nilfs->ns_sbp[]).
- *
- * In most cases, VFS functions call lock_super() before calling these
- * methods. So we must be careful not to bring on deadlocks when using
- * lock_super(); see generic_shutdown_super(), write_super(), and so on.
- *
- * Note that order of lock_kernel() and lock_super() depends on contexts
- * of VFS. We should also note that lock_kernel() can be used in its
- * protective section and only the outermost one has an effect.
- */
-static void nilfs_write_super(struct super_block *sb)
+static int nilfs_sync_fs(struct super_block *sb, int wait)
{
struct nilfs_sb_info *sbi = NILFS_SB(sb);
struct the_nilfs *nilfs = sbi->s_nilfs;
-
- down_write(&nilfs->ns_sem);
- if (!(sb->s_flags & MS_RDONLY)) {
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
- u64 t = get_seconds();
- int dupsb;
-
- if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
- t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
- up_write(&nilfs->ns_sem);
- return;
- }
- dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
- nilfs_commit_super(sbi, dupsb);
- }
- sb->s_dirt = 0;
- up_write(&nilfs->ns_sem);
-}
-
-static int nilfs_sync_fs(struct super_block *sb, int wait)
-{
int err = 0;
- nilfs_write_super(sb);
-
/* This function is called when super block should be written back */
if (wait)
err = nilfs_construct_segment(sb);
+
+ down_write(&nilfs->ns_sem);
+ if (sb->s_dirt)
+ nilfs_commit_super(sbi, 1);
+ up_write(&nilfs->ns_sem);
+
return err;
}
@@ -407,15 +363,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
list_add(&sbi->s_list, &nilfs->ns_supers);
up_write(&nilfs->ns_super_sem);
- sbi->s_ifile = nilfs_mdt_new(
- nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
+ sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
if (!sbi->s_ifile)
return -ENOMEM;
- err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
- if (unlikely(err))
- goto failed;
-
down_read(&nilfs->ns_segctor_sem);
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
&bh_cp);
@@ -456,7 +407,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
{
struct the_nilfs *nilfs = sbi->s_nilfs;
- nilfs_mdt_clear(sbi->s_ifile);
nilfs_mdt_destroy(sbi->s_ifile);
sbi->s_ifile = NULL;
down_write(&nilfs->ns_super_sem);
@@ -464,22 +414,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
up_write(&nilfs->ns_super_sem);
}
-static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
-{
- struct the_nilfs *nilfs = sbi->s_nilfs;
- int err = 0;
-
- down_write(&nilfs->ns_sem);
- if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
- nilfs->ns_mount_state |= NILFS_VALID_FS;
- err = nilfs_commit_super(sbi, 1);
- if (likely(!err))
- printk(KERN_INFO "NILFS: recovery complete.\n");
- }
- up_write(&nilfs->ns_sem);
- return err;
-}
-
static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -529,7 +463,29 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct super_operations nilfs_sops = {
+static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct super_block *sb = vfs->mnt_sb;
+ struct nilfs_sb_info *sbi = NILFS_SB(sb);
+
+ if (!nilfs_test_opt(sbi, BARRIER))
+ seq_printf(seq, ",nobarrier");
+ if (nilfs_test_opt(sbi, SNAPSHOT))
+ seq_printf(seq, ",cp=%llu",
+ (unsigned long long int)sbi->s_snapshot_cno);
+ if (nilfs_test_opt(sbi, ERRORS_RO))
+ seq_printf(seq, ",errors=remount-ro");
+ if (nilfs_test_opt(sbi, ERRORS_PANIC))
+ seq_printf(seq, ",errors=panic");
+ if (nilfs_test_opt(sbi, STRICT_ORDER))
+ seq_printf(seq, ",order=strict");
+ if (nilfs_test_opt(sbi, NORECOVERY))
+ seq_printf(seq, ",norecovery");
+
+ return 0;
+}
+
+static const struct super_operations nilfs_sops = {
.alloc_inode = nilfs_alloc_inode,
.destroy_inode = nilfs_destroy_inode,
.dirty_inode = nilfs_dirty_inode,
@@ -538,7 +494,7 @@ static struct super_operations nilfs_sops = {
/* .drop_inode = nilfs_drop_inode, */
.delete_inode = nilfs_delete_inode,
.put_super = nilfs_put_super,
- .write_super = nilfs_write_super,
+ /* .write_super = nilfs_write_super, */
.sync_fs = nilfs_sync_fs,
/* .write_super_lockfs */
/* .unlockfs */
@@ -546,7 +502,7 @@ static struct super_operations nilfs_sops = {
.remount_fs = nilfs_remount,
.clear_inode = nilfs_clear_inode,
/* .umount_begin */
- /* .show_options */
+ .show_options = nilfs_show_options
};
static struct inode *
@@ -585,7 +541,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
nilfs_nfs_get_inode);
}
-static struct export_operations nilfs_export_ops = {
+static const struct export_operations nilfs_export_ops = {
.fh_to_dentry = nilfs_fh_to_dentry,
.fh_to_parent = nilfs_fh_to_parent,
.get_parent = nilfs_get_parent,
@@ -593,7 +549,7 @@ static struct export_operations nilfs_export_ops = {
enum {
Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_barrier, Opt_snapshot, Opt_order,
+ Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
Opt_err,
};
@@ -601,25 +557,13 @@ static match_table_t tokens = {
{Opt_err_cont, "errors=continue"},
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
- {Opt_barrier, "barrier=%s"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_snapshot, "cp=%u"},
{Opt_order, "order=%s"},
+ {Opt_norecovery, "norecovery"},
{Opt_err, NULL}
};
-static int match_bool(substring_t *s, int *result)
-{
- int len = s->to - s->from;
-
- if (strncmp(s->from, "on", len) == 0)
- *result = 1;
- else if (strncmp(s->from, "off", len) == 0)
- *result = 0;
- else
- return 1;
- return 0;
-}
-
static int parse_options(char *options, struct super_block *sb)
{
struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -637,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb)
token = match_token(p, tokens, args);
switch (token) {
- case Opt_barrier:
- if (match_bool(&args[0], &option))
- return 0;
- if (option)
- nilfs_set_opt(sbi, BARRIER);
- else
- nilfs_clear_opt(sbi, BARRIER);
+ case Opt_nobarrier:
+ nilfs_clear_opt(sbi, BARRIER);
break;
case Opt_order:
if (strcmp(args[0].from, "relaxed") == 0)
@@ -672,6 +611,9 @@ static int parse_options(char *options, struct super_block *sb)
sbi->s_snapshot_cno = option;
nilfs_set_opt(sbi, SNAPSHOT);
break;
+ case Opt_norecovery:
+ nilfs_set_opt(sbi, NORECOVERY);
+ break;
default:
printk(KERN_ERR
"NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -697,9 +639,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
int mnt_count = le16_to_cpu(sbp->s_mnt_count);
/* nilfs->sem must be locked by the caller. */
- if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
- printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
- } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
+ if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
printk(KERN_WARNING
"NILFS warning: mounting fs with errors\n");
#if 0
@@ -807,19 +747,23 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
sb->s_root = NULL;
sb->s_time_gran = 1;
- if (!nilfs_loaded(nilfs)) {
- err = load_nilfs(nilfs, sbi);
- if (err)
- goto failed_sbi;
- }
+ err = load_nilfs(nilfs, sbi);
+ if (err)
+ goto failed_sbi;
+
cno = nilfs_last_cno(nilfs);
if (sb->s_flags & MS_RDONLY) {
if (nilfs_test_opt(sbi, SNAPSHOT)) {
+ down_read(&nilfs->ns_segctor_sem);
err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
sbi->s_snapshot_cno);
- if (err < 0)
+ up_read(&nilfs->ns_segctor_sem);
+ if (err < 0) {
+ if (err == -ENOENT)
+ err = -EINVAL;
goto failed_sbi;
+ }
if (!err) {
printk(KERN_ERR
"NILFS: The specified checkpoint is "
@@ -874,12 +818,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
up_write(&nilfs->ns_sem);
}
- err = nilfs_mark_recovery_complete(sbi);
- if (unlikely(err)) {
- printk(KERN_ERR "NILFS: recovery failed.\n");
- goto failed_root;
- }
-
down_write(&nilfs->ns_super_sem);
if (!nilfs_test_opt(sbi, SNAPSHOT))
nilfs->ns_current = sbi;
@@ -887,10 +825,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
return 0;
- failed_root:
- dput(sb->s_root);
- sb->s_root = NULL;
-
failed_segctor:
nilfs_detach_segment_constructor(sbi);
@@ -935,6 +869,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (!nilfs_valid_fs(nilfs)) {
+ printk(KERN_WARNING "NILFS (device %s): couldn't "
+ "remount because the filesystem is in an "
+ "incomplete recovery state.\n", sb->s_id);
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
goto out;
if (*flags & MS_RDONLY) {
@@ -1127,10 +1069,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
*/
sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
- if (!sd.cno)
- /* trying to get the latest checkpoint. */
- sd.cno = nilfs_last_cno(nilfs);
-
/*
* Get super block instance holding the nilfs_sb_info struct.
* A new instance is allocated if no existing mount is present or
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..6241e1722efc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
nilfs->ns_bdev = bdev;
atomic_set(&nilfs->ns_count, 1);
- atomic_set(&nilfs->ns_writer_refcount, -1);
atomic_set(&nilfs->ns_ndirtyblks, 0);
init_rwsem(&nilfs->ns_sem);
init_rwsem(&nilfs->ns_super_sem);
mutex_init(&nilfs->ns_mount_mutex);
- mutex_init(&nilfs->ns_writer_mutex);
+ init_rwsem(&nilfs->ns_writer_sem);
INIT_LIST_HEAD(&nilfs->ns_list);
INIT_LIST_HEAD(&nilfs->ns_supers);
spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -147,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs)
might_sleep();
if (nilfs_loaded(nilfs)) {
- nilfs_mdt_clear(nilfs->ns_sufile);
nilfs_mdt_destroy(nilfs->ns_sufile);
- nilfs_mdt_clear(nilfs->ns_cpfile);
nilfs_mdt_destroy(nilfs->ns_cpfile);
- nilfs_mdt_clear(nilfs->ns_dat);
nilfs_mdt_destroy(nilfs->ns_dat);
- /* XXX: how and when to clear nilfs->ns_gc_dat? */
nilfs_mdt_destroy(nilfs->ns_gc_dat);
}
if (nilfs_init(nilfs)) {
@@ -167,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs)
static int nilfs_load_super_root(struct the_nilfs *nilfs,
struct nilfs_sb_info *sbi, sector_t sr_block)
{
- static struct lock_class_key dat_lock_key;
struct buffer_head *bh_sr;
struct nilfs_super_root *raw_sr;
struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -188,55 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
inode_size = nilfs->ns_inode_size;
err = -ENOMEM;
- nilfs->ns_dat = nilfs_mdt_new(
- nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+ nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
if (unlikely(!nilfs->ns_dat))
goto failed;
- nilfs->ns_gc_dat = nilfs_mdt_new(
- nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+ nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
if (unlikely(!nilfs->ns_gc_dat))
goto failed_dat;
- nilfs->ns_cpfile = nilfs_mdt_new(
- nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
+ nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
if (unlikely(!nilfs->ns_cpfile))
goto failed_gc_dat;
- nilfs->ns_sufile = nilfs_mdt_new(
- nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
+ nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
if (unlikely(!nilfs->ns_sufile))
goto failed_cpfile;
- err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
- if (unlikely(err))
- goto failed_sufile;
-
- err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
- if (unlikely(err))
- goto failed_sufile;
-
- lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
- lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
-
nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
- nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
- sizeof(struct nilfs_cpfile_header));
- nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
- sizeof(struct nilfs_sufile_header));
- err = nilfs_mdt_read_inode_direct(
- nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
+ err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
+ NILFS_SR_DAT_OFFSET(inode_size));
if (unlikely(err))
goto failed_sufile;
- err = nilfs_mdt_read_inode_direct(
- nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
+ err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
+ NILFS_SR_CPFILE_OFFSET(inode_size));
if (unlikely(err))
goto failed_sufile;
- err = nilfs_mdt_read_inode_direct(
- nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
+ err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
+ NILFS_SR_SUFILE_OFFSET(inode_size));
if (unlikely(err))
goto failed_sufile;
@@ -286,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
struct nilfs_recovery_info ri;
unsigned int s_flags = sbi->s_super->s_flags;
int really_read_only = bdev_read_only(nilfs->ns_bdev);
- unsigned valid_fs;
- int err = 0;
-
- nilfs_init_recovery_info(&ri);
+ int valid_fs = nilfs_valid_fs(nilfs);
+ int err;
- down_write(&nilfs->ns_sem);
- valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
- up_write(&nilfs->ns_sem);
+ if (nilfs_loaded(nilfs)) {
+ if (valid_fs ||
+ ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
+ return 0;
+ printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
+ "recovery state.\n");
+ return -EINVAL;
+ }
- if (!valid_fs && (s_flags & MS_RDONLY)) {
- printk(KERN_INFO "NILFS: INFO: recovery "
- "required for readonly filesystem.\n");
- if (really_read_only) {
- printk(KERN_ERR "NILFS: write access "
- "unavailable, cannot proceed.\n");
- err = -EROFS;
- goto failed;
+ if (!valid_fs) {
+ printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+ if (s_flags & MS_RDONLY) {
+ printk(KERN_INFO "NILFS: INFO: recovery "
+ "required for readonly filesystem.\n");
+ printk(KERN_INFO "NILFS: write access will "
+ "be enabled during recovery.\n");
}
- printk(KERN_INFO "NILFS: write access will "
- "be enabled during recovery.\n");
- sbi->s_super->s_flags &= ~MS_RDONLY;
}
+ nilfs_init_recovery_info(&ri);
+
err = nilfs_search_super_root(nilfs, sbi, &ri);
if (unlikely(err)) {
printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -321,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
goto failed;
}
- if (!valid_fs) {
- err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
- if (unlikely(err)) {
- nilfs_mdt_destroy(nilfs->ns_cpfile);
- nilfs_mdt_destroy(nilfs->ns_sufile);
- nilfs_mdt_destroy(nilfs->ns_dat);
- goto failed;
+ if (valid_fs)
+ goto skip_recovery;
+
+ if (s_flags & MS_RDONLY) {
+ if (nilfs_test_opt(sbi, NORECOVERY)) {
+ printk(KERN_INFO "NILFS: norecovery option specified. "
+ "skipping roll-forward recovery\n");
+ goto skip_recovery;
}
- if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
- sbi->s_super->s_dirt = 1;
+ if (really_read_only) {
+ printk(KERN_ERR "NILFS: write access "
+ "unavailable, cannot proceed.\n");
+ err = -EROFS;
+ goto failed_unload;
+ }
+ sbi->s_super->s_flags &= ~MS_RDONLY;
+ } else if (nilfs_test_opt(sbi, NORECOVERY)) {
+ printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
+ "option was specified for a read/write mount\n");
+ err = -EINVAL;
+ goto failed_unload;
}
+ err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+ if (err)
+ goto failed_unload;
+
+ down_write(&nilfs->ns_sem);
+ nilfs->ns_mount_state |= NILFS_VALID_FS;
+ nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+ err = nilfs_commit_super(sbi, 1);
+ up_write(&nilfs->ns_sem);
+
+ if (err) {
+ printk(KERN_ERR "NILFS: failed to update super block. "
+ "recovery unfinished.\n");
+ goto failed_unload;
+ }
+ printk(KERN_INFO "NILFS: recovery complete.\n");
+
+ skip_recovery:
set_nilfs_loaded(nilfs);
+ nilfs_clear_recovery_info(&ri);
+ sbi->s_super->s_flags = s_flags;
+ return 0;
+
+ failed_unload:
+ nilfs_mdt_destroy(nilfs->ns_cpfile);
+ nilfs_mdt_destroy(nilfs->ns_sufile);
+ nilfs_mdt_destroy(nilfs->ns_dat);
failed:
nilfs_clear_recovery_info(&ri);
@@ -596,9 +609,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
- bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
/* Finding last segment */
@@ -639,30 +650,23 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
{
struct inode *dat = nilfs_dat_inode(nilfs);
unsigned long ncleansegs;
- int err;
down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
- err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
+ ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
- if (likely(!err))
- *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
- return err;
+ *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+ return 0;
}
int nilfs_near_disk_full(struct the_nilfs *nilfs)
{
- struct inode *sufile = nilfs->ns_sufile;
unsigned long ncleansegs, nincsegs;
- int ret;
- ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
- if (likely(!ret)) {
- nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
- nilfs->ns_blocks_per_segment + 1;
- if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
- ret++;
- }
- return ret;
+ ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+ nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+ nilfs->ns_blocks_per_segment + 1;
+
+ return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
}
/**
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..589786e33464 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
the latest checkpoint was loaded */
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
+ THE_NILFS_GC_RUNNING, /* gc process is running */
};
/**
@@ -50,8 +51,7 @@ enum {
* @ns_sem: semaphore for shared states
* @ns_super_sem: semaphore for global operations across super block instances
* @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_mutex: mutex protecting ns_writer attach/detach
- * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_writer_sem: semaphore protecting ns_writer attach/detach
* @ns_current: back pointer to current mount
* @ns_sbh: buffer heads of on-disk super blocks
* @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
struct rw_semaphore ns_sem;
struct rw_semaphore ns_super_sem;
struct mutex ns_mount_mutex;
- struct mutex ns_writer_mutex;
- atomic_t ns_writer_refcount;
+ struct rw_semaphore ns_writer_sem;
/*
* components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
THE_NILFS_FNS(INIT, init)
THE_NILFS_FNS(LOADED, loaded)
THE_NILFS_FNS(DISCONTINUED, discontinued)
+THE_NILFS_FNS(GC_RUNNING, gc_running)
/* Minimum interval of periodical update of superblocks (in seconds) */
#define NILFS_SB_FREQ 10
#define NILFS_ALTSB_FREQ 60 /* spare superblock */
+static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
+{
+ u64 t = get_seconds();
+ return t < nilfs->ns_sbwtime[0] ||
+ t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+}
+
+static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+{
+ u64 t = get_seconds();
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+}
+
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
struct the_nilfs *find_or_create_nilfs(struct block_device *);
void put_nilfs(struct the_nilfs *);
@@ -221,34 +235,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
atomic_inc(&nilfs->ns_count);
}
-static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
-{
- if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
- mutex_lock(&nilfs->ns_writer_mutex);
- return nilfs->ns_writer;
-}
-
-static inline void nilfs_put_writer(struct the_nilfs *nilfs)
-{
- if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
- mutex_unlock(&nilfs->ns_writer_mutex);
-}
-
static inline void
nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
{
- mutex_lock(&nilfs->ns_writer_mutex);
+ down_write(&nilfs->ns_writer_sem);
nilfs->ns_writer = sbi;
- mutex_unlock(&nilfs->ns_writer_mutex);
+ up_write(&nilfs->ns_writer_sem);
}
static inline void
nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
{
- mutex_lock(&nilfs->ns_writer_mutex);
+ down_write(&nilfs->ns_writer_sem);
if (sbi == nilfs->ns_writer)
nilfs->ns_writer = NULL;
- mutex_unlock(&nilfs->ns_writer_mutex);
+ up_write(&nilfs->ns_writer_sem);
}
static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
@@ -257,6 +258,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
kfree(sbi);
}
+static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
+{
+ unsigned valid_fs;
+
+ down_read(&nilfs->ns_sem);
+ valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+ up_read(&nilfs->ns_sem);
+ return valid_fs;
+}
+
static inline void
nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
sector_t *seg_start, sector_t *seg_end)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d83b31..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
while (*s && len > 0) {
if (*s & 0x80) {
size = utf8_to_utf32(s, len, &u);
- if (size < 0) {
- /* Ignore character and move on */
- size = 1;
- } else if (u >= PLANE_SIZE) {
+ if (size < 0)
+ return -EINVAL;
+
+ if (u >= PLANE_SIZE) {
u -= PLANE_SIZE;
*op++ = (wchar_t) (SURROGATE_PAIR |
((u >> 10) & SURROGATE_BITS));
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
void unload_nls(struct nls_table *nls)
{
- module_put(nls->owner);
+ if (nls)
+ module_put(nls->owner);
}
static const wchar_t charset2uni[256] = {
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 828a889be909..7e54e52964dd 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct fown_struct *fown;
+ __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
to_tell = event->to_tell;
@@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
spin_lock(&entry->lock);
prev = &dnentry->dn;
while ((dn = *prev) != NULL) {
- if ((dn->dn_mask & event->mask) == 0) {
+ if ((dn->dn_mask & test_mask) == 0) {
prev = &dn->dn_next;
continue;
}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index c8a07c65482b..3165d85aada2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
spin_lock(&group->mark_lock);
spin_lock(&inode->i_lock);
- entry->group = group;
- entry->inode = inode;
-
lentry = fsnotify_find_mark_entry(group, inode);
if (!lentry) {
+ entry->group = group;
+ entry->inode = inode;
+
hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
list_add(&entry->g_list, &group->mark_entries);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3816d5750dd5..b8bf53b4c108 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
/* remember, after old was put on the wait_q we aren't
* allowed to look at the inode any more, only thing
* left to check was if the file_name is the same */
- if (old->name_len &&
+ if (!old->name_len ||
!strcmp(old->file_name, new->file_name))
return true;
break;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
+ .error_remove_page = generic_error_remove_page,
};
/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
+ .error_remove_page = generic_error_remove_page,
};
#ifdef NTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..663c0e341f8b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
- /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
- if (likely(!status)) {
- if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
- if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
- status = generic_osync_inode(vi, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- }
- }
pagevec_lru_add_file(&lru_pvec);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err = sync_page_range(inode, mapping, pos, ret);
+ if (ret > 0) {
+ int err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
}
@@ -2154,46 +2146,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
}
/**
- * ntfs_file_writev -
- *
- * Basically the same as generic_file_writev() except that it ends up calling
- * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
- */
-static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct kiocb kiocb;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
- init_sync_kiocb(&kiocb, file);
- ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
- mutex_unlock(&inode->i_mutex);
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err = sync_page_range(inode, mapping, *ppos - ret, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-
-/**
- * ntfs_file_write - simple wrapper for ntfs_file_writev()
- */
-static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
-
- return ntfs_file_writev(file, &local_iov, 1, ppos);
-}
-
-/**
* ntfs_file_fsync - sync a file to disk
* @filp: file to be synced
* @dentry: dentry describing the file to sync
@@ -2255,7 +2207,7 @@ const struct file_operations ntfs_file_ops = {
.read = do_sync_read, /* Read from file. */
.aio_read = generic_file_aio_read, /* Async read from file. */
#ifdef NTFS_RW
- .write = ntfs_file_write, /* Write to file. */
+ .write = do_sync_write, /* Write to file. */
.aio_write = ntfs_file_aio_write, /* Async write to file. */
/*.release = ,*/ /* Last file is closed. See
fs/ext2/file.c::
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 50931b1ce4b9..8b2549f672bf 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -829,7 +829,7 @@ enum {
/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
- is used to to obtain all flags that are valid for setting. */
+ is used to obtain all flags that are valid for setting. */
/*
* The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
* FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
/* return (void *)__get_free_page(gfp_mask); */
}
- if (likely(size >> PAGE_SHIFT < num_physpages))
+ if (likely((size >> PAGE_SHIFT) < totalram_pages))
return __vmalloc(size, gfp_mask, PAGE_KERNEL);
return NULL;
}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index abaaa1cbf8de..80b04770e8e9 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -201,8 +201,7 @@ use_utf8:
v, old_nls->charset);
nls_map = old_nls;
} else /* nls_map */ {
- if (old_nls)
- unload_nls(old_nls);
+ unload_nls(old_nls);
}
} else if (!strcmp(p, "utf8")) {
bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
ntfs_free(vol->upcase);
vol->upcase = NULL;
}
- if (vol->nls_map) {
- unload_nls(vol->nls_map);
- vol->nls_map = NULL;
- }
+
+ unload_nls(vol->nls_map);
+
sb->s_fs_info = NULL;
kfree(vol);
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872e..0d840669698e 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -6,6 +6,7 @@ config OCFS2_FS
select CRC32
select QUOTA
select QUOTA_TREE
+ select FS_POSIX_ACL
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
@@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS
This option will enable expensive consistency checks. Enable
this option for debugging only as it is likely to decrease
performance of the filesystem.
-
-config OCFS2_FS_POSIX_ACL
- bool "OCFS2 POSIX Access Control Lists"
- depends on OCFS2_FS
- select FS_POSIX_ACL
- default n
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 01596079dd63..600d2d2ade11 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
locks.o \
mmap.o \
namei.o \
+ refcounttree.o \
resize.o \
slot_map.o \
suballoc.o \
@@ -38,11 +39,8 @@ ocfs2-objs := \
ver.o \
quota_local.o \
quota_global.o \
- xattr.o
-
-ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
-ocfs2-objs += acl.o
-endif
+ xattr.o \
+ acl.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da5..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,8 +26,6 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
-
extern int ocfs2_check_acl(struct inode *, int);
extern int ocfs2_acl_chmod(struct inode *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
@@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
struct ocfs2_alloc_context *,
struct ocfs2_alloc_context *);
-#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
-
-#define ocfs2_check_acl NULL
-static inline int ocfs2_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-static inline int ocfs2_init_acl(handle_t *handle,
- struct inode *inode,
- struct inode *dir,
- struct buffer_head *di_bh,
- struct buffer_head *dir_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac)
-{
- return 0;
-}
-
-#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
-
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab513ddaeff2..38a42f5d59ff 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,10 +49,21 @@
#include "super.h"
#include "uptodate.h"
#include "xattr.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
+enum ocfs2_contig_type {
+ CONTIG_NONE = 0,
+ CONTIG_LEFT,
+ CONTIG_RIGHT,
+ CONTIG_LEFTRIGHT,
+};
+static enum ocfs2_contig_type
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
/*
* Operations for a specific extent tree type.
*
@@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations {
* that value. new_clusters is the delta, and must be
* added to the total. Required.
*/
- void (*eo_update_clusters)(struct inode *inode,
- struct ocfs2_extent_tree *et,
+ void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
u32 new_clusters);
/*
+ * If this extent tree is supported by an extent map, insert
+ * a record into the map.
+ */
+ void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+
+ /*
+ * If this extent tree is supported by an extent map, truncate the
+ * map to clusters,
+ */
+ void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
+ u32 clusters);
+
+ /*
* If ->eo_insert_check() exists, it is called before rec is
* inserted into the extent tree. It is optional.
*/
- int (*eo_insert_check)(struct inode *inode,
- struct ocfs2_extent_tree *et,
+ int (*eo_insert_check)(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
- int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+ int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
/*
* --------------------------------------------------------------
@@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations {
* it exists. If it does not, et->et_max_leaf_clusters is set
* to 0 (unlimited). Optional.
*/
- void (*eo_fill_max_leaf_clusters)(struct inode *inode,
- struct ocfs2_extent_tree *et);
+ void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
+ * are contiguous or not. Optional. Don't need to set it if use
+ * ocfs2_extent_rec as the tree leaf.
+ */
+ enum ocfs2_contig_type
+ (*eo_extent_contig)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
};
@@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations {
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 blkno);
-static void ocfs2_dinode_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters);
-static int ocfs2_dinode_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters);
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
-static int ocfs2_dinode_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et);
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
+ .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
+ .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
.eo_insert_check = ocfs2_dinode_insert_check,
.eo_sanity_check = ocfs2_dinode_sanity_check,
.eo_fill_root_el = ocfs2_dinode_fill_root_el,
@@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(di->i_last_eb_blk);
}
-static void ocfs2_dinode_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
struct ocfs2_dinode *di = et->et_object;
le32_add_cpu(&di->i_clusters, clusters);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ spin_lock(&oi->ip_lock);
+ oi->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&oi->ip_lock);
}
-static int ocfs2_dinode_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_insert_rec(inode, rec);
+}
+
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_trunc(inode, clusters);
+}
+
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+ struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
- BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+ BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
- (OCFS2_I(inode)->ip_clusters !=
- le32_to_cpu(rec->e_cpos)),
+ (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
"Device %s, asking for sparse allocation: inode %llu, "
"cpos %u, clusters %u\n",
osb->dev_str,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- rec->e_cpos,
- OCFS2_I(inode)->ip_clusters);
+ (unsigned long long)oi->ip_blkno,
+ rec->e_cpos, oi->ip_clusters);
return 0;
}
-static int ocfs2_dinode_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
{
struct ocfs2_dinode *di = et->et_object;
@@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}
-static void ocfs2_xattr_value_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_xattr_value_buf *vb = et->et_object;
@@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
}
-static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
{
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
et->et_max_leaf_clusters =
- ocfs2_clusters_for_bytes(inode->i_sb,
- OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+ ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
}
static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(xt->xt_last_eb_blk);
}
-static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_xattr_block *xb = et->et_object;
@@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(dx_root->dr_last_eb_blk);
}
-static void ocfs2_dx_root_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode,
le32_add_cpu(&dx_root->dr_clusters, clusters);
}
-static int ocfs2_dx_root_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
{
struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_fill_root_el = ocfs2_dx_root_fill_root_el,
};
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
+ .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
+};
+
static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
@@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
{
et->et_ops = ops;
et->et_root_bh = bh;
+ et->et_ci = ci;
et->et_root_journal_access = access;
if (!obj)
obj = (void *)bh->b_data;
@@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
if (!et->et_ops->eo_fill_max_leaf_clusters)
et->et_max_leaf_clusters = 0;
else
- et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+ et->et_ops->eo_fill_max_leaf_clusters(et);
}
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
NULL, &ocfs2_dinode_et_ops);
}
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
NULL, &ocfs2_xattr_tree_et_ops);
}
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct ocfs2_xattr_value_buf *vb)
{
- __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
+ __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
&ocfs2_xattr_value_et_ops);
}
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
NULL, &ocfs2_dx_root_et_ops);
}
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+ NULL, &ocfs2_refcount_tree_et_ops);
+}
+
static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 new_last_eb_blk)
{
@@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
return et->et_ops->eo_get_last_eb_blk(et);
}
-static inline void ocfs2_et_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
- et->et_ops->eo_update_clusters(inode, et, clusters);
+ et->et_ops->eo_update_clusters(et, clusters);
+}
+
+static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ if (et->et_ops->eo_extent_map_insert)
+ et->et_ops->eo_extent_map_insert(et, rec);
+}
+
+static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ if (et->et_ops->eo_extent_map_truncate)
+ et->et_ops->eo_extent_map_truncate(et, clusters);
}
static inline int ocfs2_et_root_journal_access(handle_t *handle,
- struct inode *inode,
struct ocfs2_extent_tree *et,
int type)
{
- return et->et_root_journal_access(handle, inode, et->et_root_bh,
+ return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
type);
}
-static inline int ocfs2_et_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static inline enum ocfs2_contig_type
+ ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ if (et->et_ops->eo_extent_contig)
+ return et->et_ops->eo_extent_contig(et, rec, insert_rec);
+
+ return ocfs2_extent_rec_contig(
+ ocfs2_metadata_cache_get_super(et->et_ci),
+ rec, insert_rec);
+}
+
+static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
{
int ret = 0;
if (et->et_ops->eo_insert_check)
- ret = et->et_ops->eo_insert_check(inode, et, rec);
+ ret = et->et_ops->eo_insert_check(et, rec);
return ret;
}
-static inline int ocfs2_et_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
{
int ret = 0;
if (et->et_ops->eo_sanity_check)
- ret = et->et_ops->eo_sanity_check(inode, et);
+ ret = et->et_ops->eo_sanity_check(et);
return ret;
}
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
-
-/*
- * Structures which describe a path through a btree, and functions to
- * manipulate them.
- *
- * The idea here is to be as generic as possible with the tree
- * manipulation code.
- */
-struct ocfs2_path_item {
- struct buffer_head *bh;
- struct ocfs2_extent_list *el;
-};
-
-#define OCFS2_MAX_PATH_DEPTH 5
-
-struct ocfs2_path {
- int p_tree_depth;
- ocfs2_journal_access_func p_root_access;
- struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
-};
-
-#define path_root_bh(_path) ((_path)->p_node[0].bh)
-#define path_root_el(_path) ((_path)->p_node[0].el)
-#define path_root_access(_path)((_path)->p_root_access)
-#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
-#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
-#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-
-static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
- u32 cpos);
-static void ocfs2_adjust_rightmost_records(struct inode *inode,
- handle_t *handle,
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec);
/*
@@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
* to build another path. Generally, this involves freeing the buffer
* heads.
*/
-static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
int i, start = 0, depth = 0;
struct ocfs2_path_item *node;
@@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
path->p_tree_depth = depth;
}
-static void ocfs2_free_path(struct ocfs2_path *path)
+void ocfs2_free_path(struct ocfs2_path *path)
{
if (path) {
ocfs2_reinit_path(path, 0);
@@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
return path;
}
-static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
return ocfs2_new_path(path_root_bh(path), path_root_el(path),
path_root_access(path));
}
-static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
return ocfs2_new_path(et->et_root_bh, et->et_root_el,
et->et_root_journal_access);
@@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
* I don't like the way this function's name looks next to
* ocfs2_journal_access_path(), but I don't have a better one.
*/
-static int ocfs2_path_bh_journal_access(handle_t *handle,
- struct inode *inode,
- struct ocfs2_path *path,
- int idx)
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx)
{
ocfs2_journal_access_func access = path_root_access(path);
@@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
if (idx)
access = ocfs2_journal_access_eb;
- return access(handle, inode, path->p_node[idx].bh,
+ return access(handle, ci, path->p_node[idx].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
}
/*
* Convenience function to journal all components in a path.
*/
-static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path)
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path)
{
int i, ret = 0;
@@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
goto out;
for(i = 0; i < path_num_items(path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
+ ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
return ret;
}
-enum ocfs2_contig_type {
- CONTIG_NONE = 0,
- CONTIG_LEFT,
- CONTIG_RIGHT,
- CONTIG_LEFTRIGHT,
-};
-
-
/*
* NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
- * ocfs2_extent_contig only work properly against leaf nodes!
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
*/
static int ocfs2_block_extent_contig(struct super_block *sb,
struct ocfs2_extent_rec *ext,
@@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
}
static enum ocfs2_contig_type
- ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- struct ocfs2_extent_rec *insert_rec)
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
@@ -753,12 +837,12 @@ static enum ocfs2_contig_type
return CONTIG_NONE;
if (ocfs2_extents_adjacent(ext, insert_rec) &&
- ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+ ocfs2_block_extent_contig(sb, ext, blkno))
return CONTIG_RIGHT;
blkno = le64_to_cpu(ext->e_blkno);
if (ocfs2_extents_adjacent(insert_rec, ext) &&
- ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+ ocfs2_block_extent_contig(sb, insert_rec, blkno))
return CONTIG_LEFT;
return CONTIG_NONE;
@@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
return 0;
}
-int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
struct buffer_head **bh)
{
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+ rc = ocfs2_read_block(ci, eb_blkno, &tmp,
ocfs2_validate_extent_block);
/* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
* How many free extents have we got before we need more meta data?
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
struct ocfs2_extent_tree *et)
{
int retval;
@@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
last_eb_blk = ocfs2_et_get_last_eb_blk(et);
if (last_eb_blk) {
- retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
+ retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
+ &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
@@ -913,9 +997,8 @@ bail:
* sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
* l_count for you
*/
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_create_new_meta_bhs(handle_t *handle,
+ struct ocfs2_extent_tree *et,
int wanted,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head *bhs[])
@@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
u16 suballoc_bit_start;
u32 num_got;
u64 first_blkno;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
struct ocfs2_extent_block *eb;
mlog_entry_void();
@@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+ ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
- status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+ status = ocfs2_journal_access_eb(handle, et->et_ci,
+ bhs[i],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
* extent block's rightmost record.
*/
static int ocfs2_adjust_rightmost_branch(handle_t *handle,
- struct inode *inode,
struct ocfs2_extent_tree *et)
{
int status;
@@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
return status;
}
- status = ocfs2_find_path(inode, path, UINT_MAX);
+ status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
goto out;
}
- status = ocfs2_journal_access_path(inode, handle, path);
+ status = ocfs2_journal_access_path(et->et_ci, handle, path);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
el = path_leaf_el(path);
rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
- ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
out:
ocfs2_free_path(path);
@@ -1068,7 +1153,7 @@ out:
/*
* Add an entire tree branch to our inode. eb_bh is the extent block
- * to start at, if we don't want to start the branch at the dinode
+ * to start at, if we don't want to start the branch at the root
* structure.
*
* last_eb_bh is required as we have to update it's next_leaf pointer
@@ -1077,9 +1162,7 @@ out:
* the new branch will be 'empty' in the sense that every block will
* contain a single record with cluster count == 0.
*/
-static int ocfs2_add_branch(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_add_branch(handle_t *handle,
struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
struct buffer_head **last_eb_bh,
@@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
if (root_end > new_cpos) {
mlog(0, "adjust the cluster end from %u to %u\n",
root_end, new_cpos);
- status = ocfs2_adjust_rightmost_branch(handle, inode, et);
+ status = ocfs2_adjust_rightmost_branch(handle, et);
if (status) {
mlog_errno(status);
goto bail;
@@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+ status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
meta_ac, new_eb_bhs);
if (status < 0) {
mlog_errno(status);
@@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- status = ocfs2_journal_access_eb(handle, inode, bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (eb_bh) {
- status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1274,9 +1357,7 @@ bail:
* returns back the new extent block so you can add a branch to it
* after this call.
*/
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
@@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
mlog_entry_void();
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
&new_eb_bh);
if (status < 0) {
mlog_errno(status);
@@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
eb_el = &eb->h_list;
root_el = et->et_root_el;
- status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1379,9 +1460,7 @@ bail:
*
* return status < 0 indicates an error.
*/
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_extent_tree *et,
+static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
struct buffer_head **target_bh)
{
int status = 0, i;
@@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(inode->i_sb, "Dinode %llu has empty "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty "
"extent list (next_free_rec == 0)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(inode->i_sb, "Dinode %llu has extent "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent "
"list where extent # %d has no physical "
"block start",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
}
@@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
brelse(bh);
bh = NULL;
- status = ocfs2_read_extent_block(inode, blkno, &bh);
+ status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1460,20 +1541,18 @@ bail:
*
* *last_eb_bh will be updated by ocfs2_add_branch().
*/
-static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
- struct ocfs2_extent_tree *et, int *final_depth,
- struct buffer_head **last_eb_bh,
+static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ int *final_depth, struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret, shift;
struct ocfs2_extent_list *el = et->et_root_el;
int depth = le16_to_cpu(el->l_tree_depth);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *bh = NULL;
BUG_ON(meta_ac == NULL);
- shift = ocfs2_find_branch_target(osb, inode, et, &bh);
+ shift = ocfs2_find_branch_target(et, &bh);
if (shift < 0) {
ret = shift;
mlog_errno(ret);
@@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
* ocfs2_add_branch). */
- ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
- meta_ac, &bh);
+ ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
mlog(0, "add branch. bh = %p\n", bh);
- ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
+ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
if (ret < 0) {
mlog_errno(ret);
@@ -1687,7 +1765,7 @@ set_and_inc:
*
* The array index of the subtree root is passed back.
*/
-static int ocfs2_find_subtree_root(struct inode *inode,
+static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
struct ocfs2_path *left,
struct ocfs2_path *right)
{
@@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode,
* The caller didn't pass two adjacent paths.
*/
mlog_bug_on_msg(i > left->p_tree_depth,
- "Inode %lu, left depth %u, right depth %u\n"
+ "Owner %llu, left depth %u, right depth %u\n"
"left leaf blk %llu, right leaf blk %llu\n",
- inode->i_ino, left->p_tree_depth,
- right->p_tree_depth,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ left->p_tree_depth, right->p_tree_depth,
(unsigned long long)path_leaf_bh(left)->b_blocknr,
(unsigned long long)path_leaf_bh(right)->b_blocknr);
} while (left->p_node[i].bh->b_blocknr ==
@@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *);
* This code can be called with a cpos larger than the tree, in which
* case it will return the rightmost path.
*/
-static int __ocfs2_find_path(struct inode *inode,
+static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *root_el, u32 cpos,
path_insert_t *func, void *data)
{
@@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
el = root_el;
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has empty extent list at "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has empty extent list at "
"depth %u\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
goto out;
@@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has bad blkno in extent list "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad blkno in extent list "
"at depth %u (index %d)\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
goto out;
@@ -1778,7 +1855,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_extent_block(inode, blkno, &bh);
+ ret = ocfs2_read_extent_block(ci, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has bad count in extent list "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad count in extent list "
"at block %llu (next free=%u, count=%u)\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
le16_to_cpu(el->l_count));
@@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh)
ocfs2_path_insert_eb(fp->path, fp->index, bh);
fp->index++;
}
-static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
- u32 cpos)
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path, u32 cpos)
{
struct find_path_data data;
data.index = 1;
data.path = path;
- return __ocfs2_find_path(inode, path_root_el(path), cpos,
+ return __ocfs2_find_path(ci, path_root_el(path), cpos,
find_path_ins, &data);
}
@@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
*
* This function doesn't handle non btree extent lists.
*/
-int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
- u32 cpos, struct buffer_head **leaf_bh)
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh)
{
int ret;
struct buffer_head *bh = NULL;
- ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
+ ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
* - When we've adjusted the last extent record in the left path leaf and the
* 1st extent record in the right path leaf during cross extent block merge.
*/
-static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
+static void ocfs2_complete_edge_insert(handle_t *handle,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
@@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
mlog_errno(ret);
}
-static int ocfs2_rotate_subtree_right(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_subtree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
@@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
left_el = path_leaf_el(left_path);
if (left_el->l_next_free_rec != left_el->l_count) {
- ocfs2_error(inode->i_sb,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
"Inode %llu has non-full interior leaf node %llu"
"(next free = %u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
return -EROFS;
@@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
/* This is a code error, not a disk corruption. */
mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
"because rightmost leaf block %llu is empty\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)right_leaf_bh->b_blocknr);
ocfs2_create_empty_extent(right_el);
@@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
goto out;
}
- ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
- subtree_index);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
out:
return ret;
@@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int op_credits,
struct ocfs2_path *path)
{
+ int ret;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits)
- return ocfs2_extend_trans(handle, credits);
+ if (handle->h_buffer_credits < credits) {
+ ret = ocfs2_extend_trans(handle,
+ credits - handle->h_buffer_credits);
+ if (ret)
+ return ret;
+
+ if (unlikely(handle->h_buffer_credits < credits))
+ return ocfs2_extend_trans(handle, credits);
+ }
return 0;
}
@@ -2321,8 +2407,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
* *ret_left_path will contain a valid path which can be passed to
* ocfs2_insert_path().
*/
-static int ocfs2_rotate_tree_right(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_tree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
enum ocfs2_split_type split,
u32 insert_cpos,
struct ocfs2_path *right_path,
@@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
int ret, start, orig_credits = handle->h_buffer_credits;
u32 cpos;
struct ocfs2_path *left_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
*ret_left_path = NULL;
@@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
insert_cpos, cpos);
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
mlog_bug_on_msg(path_leaf_bh(left_path) ==
path_leaf_bh(right_path),
- "Inode %lu: error during insert of %u "
+ "Owner %llu: error during insert of %u "
"(left path cpos %u) results in two identical "
"paths ending at %llu\n",
- inode->i_ino, insert_cpos, cpos,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos,
(unsigned long long)
path_leaf_bh(left_path)->b_blocknr);
@@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out_ret_path;
}
- start = ocfs2_find_subtree_root(inode, left_path, right_path);
+ start = ocfs2_find_subtree_root(et, left_path, right_path);
mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
start,
@@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
- ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+ ret = ocfs2_rotate_subtree_right(handle, et, left_path,
right_path, start);
if (ret) {
mlog_errno(ret);
@@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
*/
ocfs2_mv_path(right_path, left_path);
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
- &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2477,7 +2564,8 @@ out_ret_path:
return ret;
}
-static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+static int ocfs2_update_edge_lengths(handle_t *handle,
+ struct ocfs2_extent_tree *et,
int subtree_index, struct ocfs2_path *path)
{
int i, idx, ret;
@@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2532,7 +2620,8 @@ out:
return ret;
}
-static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_cached_dealloc_ctxt *dealloc,
struct ocfs2_path *path, int unlink_start)
{
@@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
mlog(ML_ERROR,
"Inode %llu, attempted to remove extent block "
"%llu with %u records\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(el->l_next_free_rec));
ocfs2_journal_dirty(handle, bh);
- ocfs2_remove_from_cache(inode, bh);
+ ocfs2_remove_from_cache(et->et_ci, bh);
continue;
}
@@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
if (ret)
mlog_errno(ret);
- ocfs2_remove_from_cache(inode, bh);
+ ocfs2_remove_from_cache(et->et_ci, bh);
}
}
-static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_subtree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index,
@@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, root_bh);
ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- ocfs2_unlink_path(inode, handle, dealloc, right_path,
+ ocfs2_unlink_path(handle, et, dealloc, right_path,
subtree_index + 1);
}
-static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_subtree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- int *deleted,
- struct ocfs2_extent_tree *et)
+ int *deleted)
{
int ret, i, del_right_subtree = 0, right_has_empty = 0;
struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
@@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
return -EAGAIN;
if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
- ret = ocfs2_journal_access_eb(handle, inode,
+ ret = ocfs2_journal_access_eb(handle, et->et_ci,
path_leaf_bh(right_path),
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
*/
BUG_ON(right_has_empty && !del_right_subtree);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
mlog_errno(ret);
if (del_right_subtree) {
- ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+ ocfs2_unlink_subtree(handle, et, left_path, right_path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
left_path);
if (ret) {
mlog_errno(ret);
@@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
*deleted = 1;
} else
- ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
out:
@@ -2852,8 +2942,8 @@ out:
return ret;
}
-static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path)
{
int ret;
@@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
if (!ocfs2_is_empty_extent(&el->l_recs[0]))
return 0;
- ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
@@ -2880,24 +2970,24 @@ out:
return ret;
}
-static int __ocfs2_rotate_tree_left(struct inode *inode,
- handle_t *handle, int orig_credits,
+static int __ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int orig_credits,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_path **empty_extent_path,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_path **empty_extent_path)
{
int ret, subtree_root, deleted;
u32 right_cpos;
struct ocfs2_path *left_path = NULL;
struct ocfs2_path *right_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
*empty_extent_path = NULL;
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
- &right_cpos);
+ ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
}
while (right_cpos) {
- ret = ocfs2_find_path(inode, right_path, right_cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- subtree_root = ocfs2_find_subtree_root(inode, left_path,
+ subtree_root = ocfs2_find_subtree_root(et, left_path,
right_path);
mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
@@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
* Caller might still want to make changes to the
* tree root, so re-add it to the journal here.
*/
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+ ret = ocfs2_rotate_subtree_left(handle, et, left_path,
right_path, subtree_root,
- dealloc, &deleted, et);
+ dealloc, &deleted);
if (ret == -EAGAIN) {
/*
* The rotation has to temporarily stop due to
@@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ocfs2_mv_path(left_path, right_path);
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
&right_cpos);
if (ret) {
mlog_errno(ret);
@@ -2997,10 +3087,10 @@ out:
return ret;
}
-static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+static int ocfs2_remove_rightmost_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, subtree_index;
u32 cpos;
@@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
struct ocfs2_extent_list *el;
- ret = ocfs2_et_sanity_check(inode, et);
+ ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
/*
@@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret) {
mlog_errno(ret);
goto out;
}
- subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
- ocfs2_unlink_subtree(inode, handle, left_path, path,
+ ocfs2_unlink_subtree(handle, et, left_path, path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
left_path);
if (ret) {
mlog_errno(ret);
@@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
* 'path' is also the leftmost path which
* means it must be the only one. This gets
* handled differently because we want to
- * revert the inode back to having extents
+ * revert the root back to having extents
* in-line.
*/
- ocfs2_unlink_path(inode, handle, dealloc, path, 1);
+ ocfs2_unlink_path(handle, et, dealloc, path, 1);
el = et->et_root_el;
el->l_tree_depth = 0;
@@ -3114,10 +3205,10 @@ out:
* the rightmost tree leaf record is removed so the caller is
* responsible for detecting and correcting that.
*/
-static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, orig_credits = handle->h_buffer_credits;
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -3134,8 +3225,7 @@ rightmost_no_delete:
* Inline extents. This is trivially handled, so do
* it up front.
*/
- ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
- path);
+ ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
if (ret)
mlog_errno(ret);
goto out;
@@ -3151,7 +3241,7 @@ rightmost_no_delete:
*
* 1) is handled via ocfs2_rotate_rightmost_leaf_left()
* 2a) we need the left branch so that we can update it with the unlink
- * 2b) we need to bring the inode back to inline extents.
+ * 2b) we need to bring the root back to inline extents.
*/
eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
@@ -3167,9 +3257,9 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has empty extent block at %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
}
@@ -3183,8 +3273,8 @@ rightmost_no_delete:
* nonempty list.
*/
- ret = ocfs2_remove_rightmost_path(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_remove_rightmost_path(handle, et, path,
+ dealloc);
if (ret)
mlog_errno(ret);
goto out;
@@ -3195,8 +3285,8 @@ rightmost_no_delete:
* and restarting from there.
*/
try_rotate:
- ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
- dealloc, &restart_path, et);
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
+ dealloc, &restart_path);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -3206,9 +3296,9 @@ try_rotate:
tmp_path = restart_path;
restart_path = NULL;
- ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
tmp_path, dealloc,
- &restart_path, et);
+ &restart_path);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
}
}
-static int ocfs2_get_right_path(struct inode *inode,
+static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path **ret_right_path)
{
@@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode,
left_el = path_leaf_el(left_path);
BUG_ON(left_el->l_next_free_rec != left_el->l_count);
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
- &right_cpos);
+ ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ left_path, &right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, right_path, right_cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3313,9 +3403,9 @@ out:
* For index == l_count - 1, the "next" means the 1st extent rec of the
* next extent block.
*/
-static int ocfs2_merge_rec_right(struct inode *inode,
- struct ocfs2_path *left_path,
+static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *split_rec,
int index)
{
@@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
/* we meet with a cross extent block merge. */
- ret = ocfs2_get_right_path(inode, left_path, &right_path);
+ ret = ocfs2_get_right_path(et, left_path, &right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
le16_to_cpu(left_rec->e_leaf_clusters) !=
le32_to_cpu(right_rec->e_cpos));
- subtree_index = ocfs2_find_subtree_root(inode,
- left_path, right_path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
handle->h_buffer_credits,
@@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
right_rec = &el->l_recs[index + 1];
}
- ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
path_num_items(left_path) - 1);
if (ret) {
mlog_errno(ret);
@@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
le32_add_cpu(&right_rec->e_cpos, -split_clusters);
le64_add_cpu(&right_rec->e_blkno,
- -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
ocfs2_cleanup_merge(el, index);
@@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
if (ret)
mlog_errno(ret);
- ocfs2_complete_edge_insert(inode, handle, left_path,
- right_path, subtree_index);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
}
out:
if (right_path)
@@ -3432,7 +3523,7 @@ out:
return ret;
}
-static int ocfs2_get_left_path(struct inode *inode,
+static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
@@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode,
/* This function shouldn't be called for non-trees. */
BUG_ON(right_path->p_tree_depth == 0);
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
right_path, &left_cpos);
if (ret) {
mlog_errno(ret);
@@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3485,12 +3576,11 @@ out:
* remove the rightmost leaf extent block in the right_path and change
* the right path to indicate the new rightmost path.
*/
-static int ocfs2_merge_rec_left(struct inode *inode,
- struct ocfs2_path *right_path,
+static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et,
int index)
{
int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
right_rec = &el->l_recs[index];
if (index == 0) {
/* we meet with a cross extent block merge. */
- ret = ocfs2_get_left_path(inode, right_path, &left_path);
+ ret = ocfs2_get_left_path(et, right_path, &left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le16_to_cpu(left_rec->e_leaf_clusters) !=
le32_to_cpu(split_rec->e_cpos));
- subtree_index = ocfs2_find_subtree_root(inode,
- left_path, right_path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
handle->h_buffer_credits,
@@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
has_empty_extent = 1;
}
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
path_num_items(right_path) - 1);
if (ret) {
mlog_errno(ret);
@@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le32_add_cpu(&right_rec->e_cpos, split_clusters);
le64_add_cpu(&right_rec->e_blkno,
- ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
ocfs2_cleanup_merge(el, index);
@@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode,
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
- ret = ocfs2_remove_rightmost_path(inode, handle,
+ ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
- dealloc, et);
+ dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
ocfs2_mv_path(right_path, left_path);
left_path = NULL;
} else
- ocfs2_complete_edge_insert(inode, handle, left_path,
+ ocfs2_complete_edge_insert(handle, left_path,
right_path, subtree_index);
}
out:
@@ -3631,15 +3722,13 @@ out:
return ret;
}
-static int ocfs2_try_to_merge_extent(struct inode *inode,
- handle_t *handle,
+static int ocfs2_try_to_merge_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_merge_ctxt *ctxt,
- struct ocfs2_extent_tree *et)
-
+ struct ocfs2_merge_ctxt *ctxt)
{
int ret = 0;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* extents - having more than one in a leaf is
* illegal.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* prevoius extent block. It is more efficient and easier
* if we do merge_right first and merge_left later.
*/
- ret = ocfs2_merge_rec_right(inode, path,
- handle, split_rec,
+ ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
/* The merge left us with an empty extent, remove it. */
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* Note that we don't pass split_rec here on purpose -
* we've merged it into the rec already.
*/
- ret = ocfs2_merge_rec_left(inode, path,
- handle, rec,
- dealloc, et,
- split_index);
+ ret = ocfs2_merge_rec_left(path, handle, et, rec,
+ dealloc, split_index);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
* print but don't bubble it up.
@@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* the record on the left (hence the left merge).
*/
if (ctxt->c_contig_type == CONTIG_RIGHT) {
- ret = ocfs2_merge_rec_left(inode,
- path,
- handle, split_rec,
- dealloc, et,
+ ret = ocfs2_merge_rec_left(path, handle, et,
+ split_rec, dealloc,
split_index);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_merge_rec_right(inode,
- path,
- handle, split_rec,
+ ret = ocfs2_merge_rec_right(path, handle,
+ et, split_rec,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path,
+ dealloc);
if (ret)
mlog_errno(ret);
ret = 0;
@@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb,
* list. If this leaf is part of an allocation tree, it is assumed
* that the tree above has been prepared.
*/
-static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
struct ocfs2_extent_list *el,
- struct ocfs2_insert_type *insert,
- struct inode *inode)
+ struct ocfs2_insert_type *insert)
{
int i = insert->ins_contig_index;
unsigned int range;
@@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
BUG_ON(i == -1);
rec = &el->l_recs[i];
- ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ insert->ins_split, rec,
insert_rec);
goto rotate;
}
@@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
le16_to_cpu(el->l_count),
- "inode %lu, depth %u, count %u, next free %u, "
+ "owner %llu, depth %u, count %u, next free %u, "
"rec.cpos %u, rec.clusters %u, "
"insert.cpos %u, insert.clusters %u\n",
- inode->i_ino,
+ ocfs2_metadata_cache_owner(et->et_ci),
le16_to_cpu(el->l_tree_depth),
le16_to_cpu(el->l_count),
le16_to_cpu(el->l_next_free_rec),
@@ -3900,8 +3981,8 @@ rotate:
ocfs2_rotate_leaf(el, insert_rec);
}
-static void ocfs2_adjust_rightmost_records(struct inode *inode,
- handle_t *handle,
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec)
{
@@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has a bad extent list",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
}
@@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
}
}
-static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+static int ocfs2_append_rec_to_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
@@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
u32 left_cpos;
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
- &left_cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ right_path, &left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4005,13 +4088,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
}
}
- ret = ocfs2_journal_access_path(inode, handle, right_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
+ ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
*ret_left_path = left_path;
ret = 0;
@@ -4022,7 +4105,7 @@ out:
return ret;
}
-static void ocfs2_split_record(struct inode *inode,
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
struct ocfs2_extent_rec *split_rec,
@@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode,
}
rec = &el->l_recs[index];
- ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ split, rec, split_rec);
ocfs2_rotate_leaf(insert_el, split_rec);
}
@@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode,
* in. left_path should only be passed in if we need to update that
* portion of the tree after an edge insert.
*/
-static int ocfs2_insert_path(struct inode *inode,
- handle_t *handle,
+static int ocfs2_insert_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
struct ocfs2_extent_rec *insert_rec,
@@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode,
* Pass both paths to the journal. The majority of inserts
* will be touching all components anyway.
*/
- ret = ocfs2_journal_access_path(inode, handle, right_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode,
* of splits, but it's easier to just let one separate
* function sort it all out.
*/
- ocfs2_split_record(inode, left_path, right_path,
+ ocfs2_split_record(et, left_path, right_path,
insert_rec, insert->ins_split);
/*
@@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode,
if (ret)
mlog_errno(ret);
} else
- ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
- insert, inode);
+ ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+ insert);
ret = ocfs2_journal_dirty(handle, leaf_bh);
if (ret)
@@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode,
*
* XXX: Should we extend the transaction here?
*/
- subtree_index = ocfs2_find_subtree_root(inode, left_path,
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
right_path);
- ocfs2_complete_edge_insert(inode, handle, left_path,
- right_path, subtree_index);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
}
ret = 0;
@@ -4196,8 +4280,7 @@ out:
return ret;
}
-static int ocfs2_do_insert_extent(struct inode *inode,
- handle_t *handle,
+static int ocfs2_do_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *type)
@@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
el = et->et_root_el;
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
if (le16_to_cpu(el->l_tree_depth) == 0) {
- ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+ ocfs2_insert_at_leaf(et, insert_rec, el, type);
goto out_update_clusters;
}
@@ -4241,7 +4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
cpos = UINT_MAX;
}
- ret = ocfs2_find_path(inode, right_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* can wind up skipping both of these two special cases...
*/
if (rotate) {
- ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
+ ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
le32_to_cpu(insert_rec->e_cpos),
right_path, &left_path);
if (ret) {
@@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
} else if (type->ins_appending == APPEND_TAIL
&& type->ins_contig != CONTIG_LEFT) {
- ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+ ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
right_path, &left_path);
if (ret) {
mlog_errno(ret);
@@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
}
- ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+ ret = ocfs2_insert_path(handle, et, left_path, right_path,
insert_rec, type);
if (ret) {
mlog_errno(ret);
@@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
out_update_clusters:
if (type->ins_split == SPLIT_NONE)
- ocfs2_et_update_clusters(inode, et,
+ ocfs2_et_update_clusters(et,
le16_to_cpu(insert_rec->e_leaf_clusters));
ret = ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4312,7 +4395,8 @@ out:
}
static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
struct ocfs2_extent_rec *split_rec)
{
@@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
struct ocfs2_path *left_path = NULL, *right_path = NULL;
struct buffer_head *bh;
struct ocfs2_extent_block *eb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
if (index > 0) {
rec = &el->l_recs[index - 1];
} else if (path->p_tree_depth > 0) {
- status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
- path, &left_cpos);
+ status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (status)
goto out;
@@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (!left_path)
goto out;
- status = ocfs2_find_path(inode, left_path, left_cpos);
+ status = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (status)
goto out;
@@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(inode->i_sb,
+ ocfs2_error(sb,
"Extent block #%llu has an "
"invalid l_next_free_rec of "
"%d. It should have "
@@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (split_rec->e_cpos == el->l_recs[index].e_cpos)
ret = CONTIG_RIGHT;
} else {
- ret = ocfs2_extent_contig(inode, rec, split_rec);
+ ret = ocfs2_et_extent_contig(et, rec, split_rec);
}
}
@@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
rec = &el->l_recs[index + 1];
else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
path->p_tree_depth > 0) {
- status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
- path, &right_cpos);
+ status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (status)
goto out;
@@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (!right_path)
goto out;
- status = ocfs2_find_path(inode, right_path, right_cpos);
+ status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
goto out;
@@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(inode->i_sb,
+ ocfs2_error(sb,
"Extent block #%llu has an "
"invalid l_next_free_rec of %d",
(unsigned long long)le64_to_cpu(eb->h_blkno),
@@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (rec) {
enum ocfs2_contig_type contig_type;
- contig_type = ocfs2_extent_contig(inode, rec, split_rec);
+ contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
ret = CONTIG_LEFTRIGHT;
@@ -4436,11 +4520,10 @@ out:
return ret;
}
-static void ocfs2_figure_contig_type(struct inode *inode,
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
- struct ocfs2_extent_rec *insert_rec,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_extent_rec *insert_rec)
{
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
- insert_rec);
+ contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+ insert_rec);
if (contig_type != CONTIG_NONE) {
insert->ins_contig_index = i;
break;
@@ -4530,8 +4613,7 @@ set_tail_append:
* All of the information is stored on the ocfs2_insert_type
* structure.
*/
-static int ocfs2_figure_insert_type(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
struct ocfs2_extent_rec *insert_rec,
int *free_records,
@@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&bh);
if (ret) {
@@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
le16_to_cpu(el->l_next_free_rec);
if (!insert->ins_tree_depth) {
- ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
ocfs2_figure_appending_type(insert, el, insert_rec);
return 0;
}
@@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* us the rightmost tree path. This is accounted for below in
* the appending code.
*/
- ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+ ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
if (ret) {
mlog_errno(ret);
goto out;
@@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* into two types of appends: simple record append, or a
* rotate inside the tail leaf.
*/
- ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
/*
* The insert code isn't quite ready to deal with all cases of
@@ -4657,13 +4739,11 @@ out:
}
/*
- * Insert an extent into an inode btree.
+ * Insert an extent into a btree.
*
- * The caller needs to update fe->i_clusters
+ * The caller needs to update the owning btree's cluster count.
*/
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
@@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
- mlog(0, "add %u clusters at position %u to inode %llu\n",
- new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ mlog(0, "add %u clusters at position %u to owner %llu\n",
+ new_clusters, cpos,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
rec.e_flags = flags;
- status = ocfs2_et_insert_check(inode, et, &rec);
+ status = ocfs2_et_insert_check(et, &rec);
if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
+ status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
&free_records, &insert);
if (status < 0) {
mlog_errno(status);
@@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
free_records, insert.ins_tree_depth);
if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
- status = ocfs2_grow_tree(inode, handle, et,
+ status = ocfs2_grow_tree(handle, et,
&insert.ins_tree_depth, &last_eb_bh,
meta_ac);
if (status) {
@@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
}
/* Finally, we can add clusters. This might rotate the tree for us. */
- status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
+ status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
if (status < 0)
mlog_errno(status);
- else if (et->et_ops == &ocfs2_dinode_et_ops)
- ocfs2_extent_map_insert_rec(inode, &rec);
+ else
+ ocfs2_et_extent_map_insert(et, &rec);
bail:
brelse(last_eb_bh);
@@ -4735,13 +4816,11 @@ bail:
* it is not limited to the file storage. Any extent tree can use this
* function if it implements the proper ocfs2_extent_tree.
*/
-int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
- struct inode *inode,
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
@@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
u32 bit_off, num_bits;
u64 block;
u8 flags = 0;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
BUG_ON(!clusters_to_add);
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
- free_extents = ocfs2_num_free_extents(osb, inode, et);
+ free_extents = ocfs2_num_free_extents(osb, et);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
@@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
BUG_ON(num_bits > clusters_to_add);
/* reserve our write early -- insert_extent may update the tree root */
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, et,
- *logical_offset, block,
+ mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
+ num_bits, bit_off,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+ status = ocfs2_insert_extent(handle, et, *logical_offset, block,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
@@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
split_rec->e_flags = rec->e_flags;
}
-static int ocfs2_split_and_insert(struct inode *inode,
- handle_t *handle,
- struct ocfs2_path *path,
+static int ocfs2_split_and_insert(handle_t *handle,
struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
struct buffer_head **last_eb_bh,
int split_index,
struct ocfs2_extent_rec *orig_split_rec,
@@ -4892,7 +4972,7 @@ leftright:
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, et,
+ ret = ocfs2_grow_tree(handle, et,
&depth, last_eb_bh, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4921,8 +5001,8 @@ leftright:
*/
insert.ins_split = SPLIT_RIGHT;
- ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
- &rec);
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &tmprec, insert_range, &rec);
split_rec = tmprec;
@@ -4930,7 +5010,7 @@ leftright:
do_leftright = 1;
}
- ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4946,7 +5026,7 @@ leftright:
ocfs2_reinit_path(path, 1);
cpos = le32_to_cpu(split_rec.e_cpos);
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4961,8 +5041,8 @@ out:
return ret;
}
-static int ocfs2_replace_extent_rec(struct inode *inode,
- handle_t *handle,
+static int ocfs2_replace_extent_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_list *el,
int split_index,
@@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode,
{
int ret;
- ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
@@ -4985,9 +5065,8 @@ out:
}
/*
- * Mark part or all of the extent record at split_index in the leaf
- * pointed to by path as written. This removes the unwritten
- * extent flag.
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
*
* Care is taken to handle contiguousness so as to not grow the tree.
*
@@ -5004,14 +5083,13 @@ out:
* have been brought into cache (and pinned via the journal), so the
* extra overhead is not expressed in terms of disk reads.
*/
-static int __ocfs2_mark_extent_written(struct inode *inode,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
- struct ocfs2_path *path,
- int split_index,
- struct ocfs2_extent_rec *split_rec,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret = 0;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_merge_ctxt ctxt;
struct ocfs2_extent_list *rightmost_el;
- if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = -EIO;
- mlog_errno(ret);
- goto out;
- }
-
if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
(le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
+ ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
split_index,
split_rec);
/*
* The core merge / split code wants to know how much room is
- * left in this inodes allocation tree, so we pass the
+ * left in this allocation tree, so we pass the
* rightmost extent list.
*/
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
if (ret) {
@@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
if (ctxt.c_contig_type == CONTIG_NONE) {
if (ctxt.c_split_covers_rec)
- ret = ocfs2_replace_extent_rec(inode, handle,
- path, el,
+ ret = ocfs2_replace_extent_rec(handle, et, path, el,
split_index, split_rec);
else
- ret = ocfs2_split_and_insert(inode, handle, path, et,
+ ret = ocfs2_split_and_insert(handle, et, path,
&last_eb_bh, split_index,
split_rec, meta_ac);
if (ret)
mlog_errno(ret);
} else {
- ret = ocfs2_try_to_merge_extent(inode, handle, path,
+ ret = ocfs2_try_to_merge_extent(handle, et, path,
split_index, split_rec,
- dealloc, &ctxt, et);
+ dealloc, &ctxt);
if (ret)
mlog_errno(ret);
}
@@ -5096,46 +5167,31 @@ out:
}
/*
- * Mark the already-existing extent at cpos as written for len clusters.
+ * Change the flags of the already-existing extent at cpos for len clusters.
+ *
+ * new_flags: the flags we want to set.
+ * clear_flags: the flags we want to clear.
+ * phys: the new physical offset we want this new extent starts from.
*
* If the existing extent is larger than the request, initiate a
* split. An attempt will be made at merging with adjacent extents.
*
* The caller is responsible for passing down meta_ac if we'll need it.
*/
-int ocfs2_mark_extent_written(struct inode *inode,
- struct ocfs2_extent_tree *et,
- handle_t *handle, u32 cpos, u32 len, u32 phys,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags)
{
int ret, index;
- u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
struct ocfs2_extent_rec split_rec;
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el;
-
- mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
- inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
-
- if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- ret = -EROFS;
- goto out;
- }
-
- /*
- * XXX: This should be fixed up so that we just re-insert the
- * next extent records.
- *
- * XXX: This is a hack on the extent tree, maybe it should be
- * an op?
- */
- if (et->et_ops == &ocfs2_dinode_et_ops)
- ocfs2_extent_map_trunc(inode, 0);
+ struct ocfs2_extent_rec *rec;
left_path = ocfs2_new_path_from_et(et);
if (!left_path) {
@@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
+ ocfs2_error(sb,
+ "Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci), cpos);
ret = -EROFS;
goto out;
}
+ ret = -EIO;
+ rec = &el->l_recs[index];
+ if (new_flags && (rec->e_flags & new_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+ "extent that already had them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ new_flags);
+ goto out;
+ }
+
+ if (clear_flags && !(rec->e_flags & clear_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+ "extent that didn't have them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ clear_flags);
+ goto out;
+ }
+
memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
split_rec.e_cpos = cpu_to_le32(cpos);
split_rec.e_leaf_clusters = cpu_to_le16(len);
split_rec.e_blkno = cpu_to_le64(start_blkno);
- split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
- split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
-
- ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
- index, &split_rec, meta_ac,
- dealloc);
+ split_rec.e_flags = rec->e_flags;
+ if (new_flags)
+ split_rec.e_flags |= new_flags;
+ if (clear_flags)
+ split_rec.e_flags &= ~clear_flags;
+
+ ret = ocfs2_split_extent(handle, et, left_path,
+ index, &split_rec, meta_ac,
+ dealloc);
if (ret)
mlog_errno(ret);
out:
ocfs2_free_path(left_path);
return ret;
+
}
-static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
- handle_t *handle, struct ocfs2_path *path,
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ * This removes the unwritten extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+
+ mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
+ inode->i_ino, cpos, len, phys);
+
+ if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+ "that are being written to, but the feature bit "
+ "is not set in the super block.",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * XXX: This should be fixed up so that we just re-insert the
+ * next extent records.
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
+
+ ret = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ 0, OCFS2_EXT_UNWRITTEN);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
@@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
*/
el = path_leaf_el(path);
rec = &el->l_recs[index];
- ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &split_rec, new_range, rec);
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
if (ret < 0) {
@@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
+ ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
meta_ac);
if (ret) {
mlog_errno(ret);
@@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
insert.ins_split = SPLIT_RIGHT;
insert.ins_tree_depth = depth;
- ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
if (ret)
mlog_errno(ret);
@@ -5247,23 +5372,23 @@ out:
return ret;
}
-static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
+static int ocfs2_truncate_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path, int index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u32 cpos, u32 len,
- struct ocfs2_extent_tree *et)
+ u32 cpos, u32 len)
{
int ret;
u32 left_cpos, rec_range, trunc_range;
int wants_rotate = 0, is_rightmost_tree_rec = 0;
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
* by this leaf and the one to it's left.
*
* There are two cases we can skip:
- * 1) Path is the leftmost one in our inode tree.
+ * 1) Path is the leftmost one in our btree.
* 2) The leaf is rightmost and will be empty after
* we remove the extent record - the rotate code
* knows how to update the newly formed edge.
*/
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
- &left_cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
* be deleted by the rotate code.
*/
rec = &el->l_recs[next_free - 1];
- ocfs2_adjust_rightmost_records(inode, handle, path,
+ ocfs2_adjust_rightmost_records(handle, et, path,
rec);
}
} else if (le32_to_cpu(rec->e_cpos) == cpos) {
@@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
/* Remove rightmost portion of the record */
le16_add_cpu(&rec->e_leaf_clusters, -len);
if (is_rightmost_tree_rec)
- ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
} else {
/* Caller should have trapped this. */
- mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
- "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
+ "(%u, %u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
le32_to_cpu(rec->e_cpos),
le16_to_cpu(rec->e_leaf_clusters), cpos, len);
BUG();
@@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
if (left_path) {
int subtree_index;
- subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
- ocfs2_complete_edge_insert(inode, handle, left_path, path,
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+ ocfs2_complete_edge_insert(handle, left_path, path,
subtree_index);
}
ocfs2_journal_dirty(handle, path_leaf_bh(path));
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5404,9 +5530,9 @@ out:
return ret;
}
-int ocfs2_remove_extent(struct inode *inode,
+int ocfs2_remove_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 len, handle_t *handle,
+ u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
@@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode,
struct ocfs2_extent_list *el;
struct ocfs2_path *path = NULL;
- ocfs2_extent_map_trunc(inode, 0);
+ /*
+ * XXX: Why are we truncating to 0 instead of wherever this
+ * affects us?
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
path = ocfs2_new_path_from_et(et);
if (!path) {
@@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode,
BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
- mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
+ mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
"(cpos %u, len %u)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, len, index,
le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
- ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len, et);
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_split_tree(inode, et, handle, path, index,
+ ret = ocfs2_split_tree(handle, et, path, index,
trunc_range, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode,
*/
ocfs2_reinit_path(path, 1);
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu: split at cpos %u lost record.",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: split at cpos %u lost record.",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
goto out;
@@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode,
rec_range = le32_to_cpu(rec->e_cpos) +
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
- ocfs2_error(inode->i_sb,
- "Inode %llu: error after split at cpos %u"
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: error after split at cpos %u"
"trunc len %u, existing record is (%u,%u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
- ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len, et);
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto out;
}
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
vfs_dq_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(inode->i_sb, len));
- ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
- dealloc);
+ ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
- ocfs2_et_update_clusters(inode, et, -len);
+ ocfs2_et_update_clusters(et, -len);
ret = ocfs2_journal_dirty(handle, et->et_root_bh);
if (ret) {
@@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
while (i >= 0) {
/* Caller has given us at least enough credits to
* update the truncate log dinode */
- status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
tl->tl_used = 0;
ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
- status = ocfs2_write_block(osb, tl_bh, tl_inode);
+ status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6400,9 +6531,9 @@ ocfs2_find_per_slot_free_list(int type,
return fl;
}
-static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
- unsigned int bit)
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 blkno,
+ unsigned int bit)
{
int ret;
struct ocfs2_per_slot_free_list *fl;
@@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
goto out;
}
- ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -6551,7 +6682,7 @@ out:
*/
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
handle_t *handle, struct ocfs2_truncate_context *tc,
- u32 clusters_to_del, u64 *delete_start)
+ u32 clusters_to_del, u64 *delete_start, u8 *flags)
{
int ret, i, index = path->p_tree_depth;
u32 new_edge = 0;
@@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
struct ocfs2_extent_rec *rec;
*delete_start = 0;
+ *flags = 0;
while (index >= 0) {
bh = path->p_node[index].bh;
@@ -6648,6 +6780,7 @@ find_tail_record:
*delete_start = le64_to_cpu(rec->e_blkno)
+ ocfs2_clusters_to_blocks(inode->i_sb,
le16_to_cpu(rec->e_leaf_clusters));
+ *flags = rec->e_flags;
/*
* If it's now empty, remove this record.
@@ -6719,7 +6852,7 @@ delete:
mlog(0, "deleting this extent block.\n");
- ocfs2_remove_from_cache(inode, bh);
+ ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
@@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_truncate_context *tc,
- struct ocfs2_path *path)
+ struct ocfs2_path *path,
+ struct ocfs2_alloc_context *meta_ac)
{
int status;
struct ocfs2_dinode *fe;
@@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
struct ocfs2_extent_list *el;
struct buffer_head *last_eb_bh = NULL;
u64 delete_blk = 0;
+ u8 rec_flags;
fe = (struct ocfs2_dinode *) fe_bh->b_data;
@@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
* Each component will be touched, so we might as well journal
* here to avoid having to handle errors later.
*/
- status = ocfs2_journal_access_path(inode, handle, path);
+ status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (last_eb_bh) {
- status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+ status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_trim_tree(inode, path, handle, tc,
- clusters_to_del, &delete_blk);
+ clusters_to_del, &delete_blk, &rec_flags);
if (status) {
mlog_errno(status);
goto bail;
@@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
if (delete_blk) {
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
+ if (rec_flags & OCFS2_EXT_REFCOUNTED)
+ status = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ delete_blk),
+ clusters_to_del, meta_ac,
+ &tc->tc_dealloc, 1);
+ else
+ status = ocfs2_truncate_log_append(osb, handle,
+ delete_blk,
+ clusters_to_del);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
- unsigned int from, unsigned int to,
- struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys)
{
int ret, partial = 0;
@@ -6933,20 +7076,16 @@ out:
ocfs2_unlock_and_free_pages(pages, numpages);
}
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
- struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
{
int numpages, ret = 0;
- struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
unsigned long index;
loff_t last_page_bytes;
BUG_ON(start > end);
- BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
- (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
-
numpages = 0;
last_page_bytes = PAGE_ALIGN(end);
index = start >> PAGE_CACHE_SHIFT;
@@ -6974,6 +7113,17 @@ out:
return ret;
}
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
+{
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+ (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+ return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
/*
* Zero the area past i_size but still within an allocated
* cluster. This avoids exposing nonzero data on subsequent file
@@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_unlock;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* this proves to be false, we could always re-build
* the in-inode data from our pages.
*/
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
- ret = ocfs2_insert_extent(osb, handle, inode, &et,
- 0, block, 1, 0, NULL);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
{
int status, i, credits, tl_sem = 0;
u32 clusters_to_del, new_highest_cpos, range;
+ u64 blkno = 0;
struct ocfs2_extent_list *el;
handle_t *handle = NULL;
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_path *path = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
mlog_entry_void();
@@ -7292,10 +7444,12 @@ start:
goto bail;
}
+ credits = 0;
+
/*
* Truncate always works against the rightmost tree branch.
*/
- status = ocfs2_find_path(inode, path, UINT_MAX);
+ status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
if (status) {
mlog_errno(status);
goto bail;
@@ -7332,10 +7486,15 @@ start:
clusters_to_del = 0;
} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+ blkno = le64_to_cpu(el->l_recs[i].e_blkno);
} else if (range > new_highest_cpos) {
clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
le32_to_cpu(el->l_recs[i].e_cpos)) -
new_highest_cpos;
+ blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ ocfs2_rec_clusters(el, &el->l_recs[i]) -
+ clusters_to_del);
} else {
status = 0;
goto bail;
@@ -7344,6 +7503,29 @@ start:
mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
+ if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ status = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
+ blkno,
+ clusters_to_del,
+ &credits,
+ &meta_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
mutex_lock(&tl_inode->i_mutex);
tl_sem = 1;
/* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7357,7 +7539,7 @@ start:
}
}
- credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+ credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
(struct ocfs2_dinode *)fe_bh->b_data,
el);
handle = ocfs2_start_trans(osb, credits);
@@ -7369,7 +7551,7 @@ start:
}
status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
- tc, path);
+ tc, path, meta_ac);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -7383,6 +7565,16 @@ start:
ocfs2_reinit_path(path, 1);
+ if (meta_ac) {
+ ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+
+ if (ref_tree) {
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ ref_tree = NULL;
+ }
+
/*
* The check above will catch the case where we've truncated
* away all allocation.
@@ -7399,6 +7591,12 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
ocfs2_run_deallocs(osb, &tc->tc_dealloc);
ocfs2_free_path(path);
@@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_extent_block(inode,
+ status = ocfs2_read_extent_block(INODE_CACHE(inode),
le64_to_cpu(fe->i_last_eb_blk),
&last_eb_bh);
if (status < 0) {
@@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 353254ba29e1..9c122d574464 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,8 @@
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions. With metadata ecc, we now call different journal_access
+ * functions. It needs the ocfs2_caching_info structure associated with
+ * I/O on the tree. With metadata ecc, we now call different journal_access
* functions for each type of metadata, so it must have the
* root_journal_access function.
* ocfs2_extent_tree_operations abstract the normal operations we do for
@@ -56,6 +57,7 @@ struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
+ struct ocfs2_caching_info *et_ci;
ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
@@ -66,31 +68,32 @@ struct ocfs2_extent_tree {
* specified object buffer.
*/
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
struct ocfs2_xattr_value_buf;
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct ocfs2_xattr_value_buf *vb);
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
/*
* Read an extent block into *bh. If *bh is NULL, a bh will be
* allocated. This is a cached read. The extent block will be validated
* with ocfs2_validate_extent_block().
*/
-int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
struct buffer_head **bh);
struct ocfs2_alloc_context;
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
@@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted {
RESTART_TRANS,
RESTART_META
};
-int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
- struct inode *inode,
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode,
- struct ocfs2_extent_tree *et,
- u32 cpos, u32 len, handle_t *handle,
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags);
+int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_btree_range(struct inode *inode,
@@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
struct ocfs2_extent_tree *et);
/*
@@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
}
int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
u64 blkno, unsigned int bit);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 blkno,
+ unsigned int bit);
static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
{
return c->c_global_allocator != NULL;
@@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
unsigned int start, unsigned int end, int trunc);
-int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
- u32 cpos, struct buffer_head **leaf_bh);
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
/*
@@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
return !rec->e_leaf_clusters;
}
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH 5
+
+struct ocfs2_path {
+ int p_tree_depth;
+ ocfs2_journal_access_func p_root_access;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path);
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8a1e61545f41..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
#include "suballoc.h"
#include "super.h"
#include "symlink.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -126,8 +127,8 @@ bail:
return err;
}
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
int err = 0;
unsigned int ext_flags;
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
+ /* We should already CoW the refcounted extent. */
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
@@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
+ /* Fallback to buffered I/O if we are appending. */
+ if (i_size_read(inode) <= offset)
+ return 0;
+
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset,
nr_segs,
@@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
goto out;
}
} else if (unwritten) {
- ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
wc->w_handle, cpos, 1, phys,
meta_ac, &wc->w_dealloc);
@@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
goto out;
}
+ /* We should already CoW the refcountd extent. */
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
/*
* Assume worst case - that we're writing in
* the middle of the extent.
@@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
ocfs2_commit_trans(osb, handle);
@@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
goto out;
}
+ ret = ocfs2_check_range_for_refcount(inode, pos, len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ } else if (ret == 1) {
+ ret = ocfs2_refcount_cow(inode, di_bh,
+ wc->w_cpos, wc->w_clen, UINT_MAX);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
&extents_to_split);
if (ret) {
@@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
(long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
clusters_to_alloc, extents_to_split);
- ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
ret = ocfs2_lock_allocators(inode, &et,
clusters_to_alloc, extents_to_split,
&data_ac, &meta_ac);
@@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* We don't want this to fail in ocfs2_write_end(), so do it
* here.
*/
- ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1997,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e11..c48e93ffc513 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct buffer_head *di_bh);
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 15c8e6deee2e..d43d34a1dd31 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -52,12 +52,12 @@ enum ocfs2_state_bits {
BUFFER_FNS(NeedsValidate, needs_validate);
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
- struct inode *inode)
+ struct ocfs2_caching_info *ci)
{
int ret = 0;
- mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
- (unsigned long long)bh->b_blocknr, inode);
+ mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
+ (unsigned long long)bh->b_blocknr, ci);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
@@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
goto out;
}
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
lock_buffer(bh);
set_buffer_uptodate(bh);
@@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
@@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
put_bh(bh);
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
out:
mlog_exit(ret);
return ret;
@@ -177,7 +177,7 @@ bail:
return status;
}
-int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
@@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
int status = 0;
int i, ignore_cache = 0;
struct buffer_head *bh;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
- mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
- inode, (unsigned long long)block, nr, flags);
+ mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
+ ci, (unsigned long long)block, nr, flags);
- BUG_ON(!inode);
+ BUG_ON(!ci);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
(flags & OCFS2_BH_IGNORE_CACHE));
@@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
goto bail;
}
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
- bhs[i] = sb_getblk(inode->i_sb, block++);
+ bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
status = -EIO;
mlog_errno(status);
goto bail;
@@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* before our is-it-in-flight check.
*/
- if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
+ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
mlog(ML_UPTODATE,
- "bh (%llu), inode %llu not uptodate\n",
+ "bh (%llu), owner %llu not uptodate\n",
(unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* We're using ignore_cache here to say
* "go to disk" */
ignore_cache = 1;
@@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* previously submitted request than we are
* done here. */
if ((flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_read_ahead(inode, bh))
+ && ocfs2_buffer_read_ahead(ci, bh))
continue;
lock_buffer(bh);
@@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* buffer lock. */
if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_uptodate(inode, bh)) {
+ && ocfs2_buffer_uptodate(ci, bh)) {
unlock_buffer(bh);
continue;
}
@@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
if (!(flags & OCFS2_BH_READAHEAD)) {
/* We know this can't have changed as we hold the
- * inode sem. Avoid doing any work on the bh if the
+ * owner sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* that better not have changed */
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
- status = validate(inode->i_sb, bh);
+ status = validate(sb, bh);
if (status) {
put_bh(bh);
bhs[i] = NULL;
@@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
(unsigned long long)block, nr,
@@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb,
/*
* Write super block and backups doesn't need to collaborate with journal,
- * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
+ * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
* into this function.
*/
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c75d682dadd8..b97bcc6dde7c 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
- struct inode *inode);
+ struct ocfs2_caching_info *ci);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
@@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* be set even for a READAHEAD call, as it marks the buffer for later
* validation.
*/
-int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
@@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
-static inline int ocfs2_read_block(struct inode *inode, u64 off,
+static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
@@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
+ status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
bail:
return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations o2hb_debug_fops = {
+static const struct file_operations o2hb_debug_fops = {
.open = o2hb_debug_open,
.release = o2hb_debug_release,
.read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df5416993e..1cd2934de615 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(EXPORT),
define_mask(XATTR),
define_mask(QUOTA),
+ define_mask(REFCOUNT),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 696c32e50716..9b4d11726cf2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
+#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa07..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations nst_seq_ops = {
+static const struct seq_operations nst_seq_ops = {
.start = nst_seq_start,
.next = nst_seq_next,
.stop = nst_seq_stop,
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations nst_seq_fops = {
+static const struct file_operations nst_seq_fops = {
.open = nst_fop_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations sc_seq_ops = {
+static const struct seq_operations sc_seq_ops = {
.start = sc_seq_start,
.next = sc_seq_next,
.stop = sc_seq_stop,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations sc_seq_fops = {
+static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b358f3bf896d..28c3ec238796 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dir_block_trailer *trailer;
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
int ret;
struct buffer_head *tmp = *bh;
- ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+ ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
+ ocfs2_validate_dir_block);
if (ret) {
mlog_errno(ret);
goto out;
@@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
u64 blkno = le64_to_cpu(di->i_dx_root);
struct buffer_head *tmp = *dx_root_bh;
- ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_root);
/* If ocfs2_read_block() got us a new bh, pass it up. */
if (!ret && !*dx_root_bh)
@@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
int ret;
struct buffer_head *tmp = *dx_leaf_bh;
- ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_leaf);
/* If ocfs2_read_block() got us a new bh, pass it up. */
if (!ret && !*dx_leaf_bh)
@@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
{
int ret;
- ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+ ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
ocfs2_validate_dx_leaf);
if (ret)
mlog_errno(ret);
@@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
struct ocfs2_extent_rec *rec = NULL;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
access = ocfs2_journal_access_di;
- ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = access(handle, INODE_CACHE(dir), de_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
goto bail;
}
if (de == de_del) {
- status = access(handle, dir, bh,
+ status = access(handle, INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
status = -EIO;
@@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
* the entry count needs to be updated. Also, we might be
* adding to the start of the free list.
*/
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
}
if (!ocfs2_dx_root_inline(dx_root)) {
- ret = ocfs2_journal_access_dl(handle, dir,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
lookup->dl_dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
int ret;
struct ocfs2_dx_leaf *dx_leaf;
- ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
struct ocfs2_dx_root_block *dx_root;
struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle,
*/
if (ocfs2_free_list_at_root(lookup)) {
bh = lookup->dl_dx_root_bh;
- retval = ocfs2_journal_access_dr(handle, dir, bh,
+ retval = ocfs2_journal_access_dr(handle,
+ INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
} else {
bh = lookup->dl_prev_leaf_bh;
- retval = ocfs2_journal_access_db(handle, dir, bh,
+ retval = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
}
if (retval) {
@@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle,
}
if (insert_bh == parent_fe_bh)
- status = ocfs2_journal_access_di(handle, dir,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
else {
- status = ocfs2_journal_access_db(handle, dir,
+ status = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
struct ocfs2_inline_data *data = &di->id2.i_data;
unsigned int size = le16_to_cpu(data->id_count);
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
- status = ocfs2_journal_access_db(handle, inode, new_bh,
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
ret = -EIO;
goto out;
}
- ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
@@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
if (ret)
mlog_errno(ret);
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
}
dx_leaves[i] = bh;
- ocfs2_set_new_buffer_uptodate(dir, bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
- ret = ocfs2_journal_access_dl(handle, dir, bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
@@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
{
int ret;
u64 phys_blkno;
- struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
num_dx_leaves, &phys_blkno);
@@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
goto out;
}
- ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+ ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
meta_ac);
if (ret)
mlog_errno(ret);
@@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_extent_tree dx_et;
int did_quota = 0, bytes_allocated = 0;
- ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
alloc = ocfs2_clusters_for_bytes(sb, bytes);
dx_alloc = 0;
@@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out_commit;
}
- ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
- ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+ ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We let the later dirent insert modify c/mtime - to the user
* the data hasn't changed.
*/
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* This should never fail as our extent list is empty and all
* related blocks have been journaled already.
*/
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+ ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
0, NULL);
if (ret) {
mlog_errno(ret);
@@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
dirdata_bh);
} else {
- ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
- ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+ ocfs2_init_dx_root_extent_tree(&dx_et,
+ INODE_CACHE(dir),
+ dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &dx_et, 0,
dx_insert_blkno, 1, 0, NULL);
if (ret)
mlog_errno(ret);
@@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
}
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+ ret = ocfs2_insert_extent(handle, &et, 1,
blkno, len, 0, NULL);
if (ret) {
mlog_errno(ret);
@@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
- ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
- num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
+ parent_fe_bh);
+ num_free_extents = ocfs2_num_free_extents(osb, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -3387,9 +3398,9 @@ do_extend:
goto bail;
}
- ocfs2_set_new_buffer_uptodate(dir, new_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
- status = ocfs2_journal_access_db(handle, dir, new_bh,
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)leaf_blkno, insert_hash);
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
/*
@@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
}
did_quota = 1;
- ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
}
for (i = 0; i < num_dx_leaves; i++) {
- ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ orig_dx_leaves[i],
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
* failure to add the dx_root_bh to the journal won't result
* us losing clusters.
*/
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
/* This should never fail considering we start with an empty
* dx_root. */
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
- insert_blkno, 1, 0, NULL);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
if (ret)
mlog_errno(ret);
did_quota = 0;
@@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
goto out_unlock;
}
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
if (ocfs2_dx_root_inline(dx_root))
goto remove_index;
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
/* XXX: What if dr_clusters is too large? */
while (le32_to_cpu(dx_root->dr_clusters)) {
@@ -4565,7 +4576,7 @@ remove_index:
goto out;
}
- ocfs2_remove_from_cache(dir, dx_root_bh);
+ ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
out:
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f669..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
@@ -479,7 +478,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_purgelist_fops = {
+static const struct file_operations debug_purgelist_fops = {
.open = debug_purgelist_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
@@ -539,7 +538,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_mle_fops = {
+static const struct file_operations debug_mle_fops = {
.open = debug_mle_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
@@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
return 0;
}
-static struct seq_operations debug_lockres_ops = {
+static const struct seq_operations debug_lockres_ops = {
.start = lockres_seq_start,
.stop = lockres_seq_stop,
.next = lockres_seq_next,
@@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations debug_lockres_fops = {
+static const struct file_operations debug_lockres_fops = {
.open = debug_lockres_open,
.release = debug_lockres_release,
.read = seq_read,
@@ -926,7 +925,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_state_fops = {
+static const struct file_operations debug_state_fops = {
.open = debug_state_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
}
static struct backing_dev_info dlmfs_backing_dev_info = {
+ .name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d490b66ad9d7..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
}
+ spin_lock(&res->spinlock);
if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purgelist, "
"master = %d\n", res->lockname.len, res->lockname.name,
res, master);
list_del_init(&res->purge);
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
dlm->purge_count--;
- }
+ } else
+ spin_unlock(&res->spinlock);
+
__dlm_unhash_lockres(res);
/* lockres is not in the hash now. drop the flag and wake up
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 110bb57c46ab..0d38d67194cb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
#include "super.h"
#include "uptodate.h"
#include "quota.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking);
+
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
/* This aids in debugging situations where a bad LVB might be involved. */
@@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
};
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+ .check_downconvert = ocfs2_check_refcount_downconvert,
+ .downconvert_worker = ocfs2_refcount_convert_worker,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
}
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+ return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
{
if (lockres->l_ops->get_osb)
@@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
info);
}
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation)
+{
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+ generation, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+ &ocfs2_refcount_block_lops, osb);
+}
+
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write)
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
- if (ocfs2_mount_local(osb))
+ if (ocfs2_mount_local(osb)) {
+ mlog_exit(0);
return 0;
+ }
lockres = &OCFS2_I(inode)->ip_rw_lockres;
@@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
/* This will discard any caching information we might have had
* for the inode metadata. */
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_purge(INODE_CACHE(inode));
ocfs2_extent_map_trunc(inode, 0);
@@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error)
"unlock_action %d\n", error, lockres->l_name,
lockres->l_unlock_action);
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ mlog_exit_void();
return;
}
@@ -3495,11 +3527,11 @@ out:
return UNBLOCK_CONTINUE;
}
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
- int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+ struct ocfs2_lock_res *lockres,
+ int new_level)
{
- struct inode *inode = ocfs2_lock_res_inode(lockres);
- int checkpointed = ocfs2_inode_fully_checkpointed(inode);
+ int checkpointed = ocfs2_ci_fully_checkpointed(ci);
BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
if (checkpointed)
return 1;
- ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+ ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
return 0;
}
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+ return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
{
struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+
+ return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+
+ ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+ return UNBLOCK_CONTINUE;
+}
+
static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
{
struct ocfs2_qinfo_lvb *lvb;
@@ -3752,6 +3812,37 @@ bail:
return status;
}
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+ int status;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
/*
* This is the filesystem locking protocol. It provides the lock handling
* hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 7553836931de..d1ce48e1b3d6 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_mem_dqinfo;
void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
@@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2bb1a04d253..843db64e9d4a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
* eb_bh is NULL. Otherwise, eb_bh should point to the extent block
* containing el.
*/
-static int ocfs2_figure_hole_clusters(struct inode *inode,
- struct ocfs2_extent_list *el,
- struct buffer_head *eb_bh,
- u32 v_cluster,
- u32 *num_clusters)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters)
{
int ret, i;
struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(ci,
le64_to_cpu(eb->h_next_leaf_blk),
&next_eb_bh);
if (ret) {
@@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
tree_height = le16_to_cpu(el->l_tree_depth);
if (tree_height > 0) {
- ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
* field.
*/
if (hole_len) {
- ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+ ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+ el, eb_bh,
v_cluster, &len);
if (ret) {
mlog_errno(ret);
@@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
- struct ocfs2_extent_list *el)
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags)
{
int ret = 0, i;
struct buffer_head *eb_bh = NULL;
@@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 coff;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+ if (extent_flags)
+ *extent_flags = rec->e_flags;
}
out:
if (eb_bh)
@@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
}
- rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
- flags, validate);
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
+ bhs + done, flags, validate);
if (rc) {
mlog_errno(rc);
break;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd9731b462..e79d41c2c909 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
- struct ocfs2_extent_list *el);
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags);
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters);
static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..3d30a1c974a8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
#include "xattr.h"
#include "acl.h"
#include "quota.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -334,6 +335,39 @@ out:
return ret;
}
+static int ocfs2_cow_file_pos(struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 offset)
+{
+ int status;
+ u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ /*
+ * If the new offset is aligned to the range of the cluster, there is
+ * no space for ocfs2_zero_range_for_truncate to fill, so no need to
+ * CoW either.
+ */
+ if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
+ return 0;
+
+ status = ocfs2_get_clusters(inode, cpos, &phys,
+ &num_clusters, &ext_flags);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+
+out:
+ return status;
+}
+
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
mlog_entry_void();
+ /*
+ * We need to CoW the cluster contains the offset if it is reflinked
+ * since we will call ocfs2_zero_range_for_truncate later which will
+ * write "0" from offset to the end of the cluster.
+ */
+ status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
+
/* TODO: This needs to actually orphan the inode in this
* transaction. */
@@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -486,6 +531,8 @@ bail_unlock_sem:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail:
+ if (!status && OCFS2_I(inode)->ip_clusters == 0)
+ status = ocfs2_try_remove_refcount_tree(inode, di_bh);
mlog_exit(status);
return status;
@@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ret;
struct ocfs2_extent_tree et;
- ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
- ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
- clusters_to_add, mark_unwritten,
- &et, handle,
- data_ac, meta_ac, reason_ret);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
+ ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
return ret;
}
@@ -564,7 +610,7 @@ restart_all:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
clusters_to_add);
- ocfs2_init_dinode_extent_tree(&et, inode, bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
&data_ac, &meta_ac);
if (status) {
@@ -593,7 +639,7 @@ restarted_transaction:
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
* update i_size. */
- status = ocfs2_journal_access_di(handle, inode, bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
struct address_space *mapping = inode->i_mapping;
struct ocfs2_extent_tree et;
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
if (byte_len == 0)
@@ -1657,6 +1703,71 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
OCFS2_IOC_RESVSP64, &sr, change_size);
}
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+ !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ loff_t pos, size_t count,
+ int *meta_level)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *meta_level = 1;
+
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(di_bh);
+ return ret;
+}
+
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
loff_t *ppos,
size_t count,
@@ -1713,6 +1824,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
end = saved_pos + count;
+ ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+ if (ret == 1) {
+ ocfs2_inode_unlock(inode, meta_level);
+ meta_level = -1;
+
+ ret = ocfs2_prepare_inode_for_refcount(inode,
+ saved_pos,
+ count,
+ &meta_level);
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
/*
* Skip the O_DIRECT checks if we don't need
* them.
@@ -1759,7 +1886,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
*ppos = saved_pos;
out_unlock:
- ocfs2_inode_unlock(inode, meta_level);
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
@@ -1871,27 +1999,29 @@ relock:
goto out_dio;
}
} else {
- written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
- *ppos);
+ written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
}
out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
- if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
- /*
- * The generic write paths have handled getting data
- * to disk, but since we don't make use of the dirty
- * inode list, a manual journal commit is necessary
- * here.
- */
- if (old_size != i_size_read(inode) ||
- old_clusters != OCFS2_I(inode)->ip_clusters) {
+ if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) {
+ ret = filemap_fdatawrite_range(file->f_mapping, pos,
+ pos + count - 1);
+ if (ret < 0)
+ written = ret;
+
+ if (!ret && (old_size != i_size_read(inode) ||
+ old_clusters != OCFS2_I(inode)->ip_clusters)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
+
+ if (!ret)
+ ret = filemap_fdatawait_range(file->f_mapping, pos,
+ pos + count - 1);
}
/*
@@ -1991,31 +2121,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
if (ret > 0) {
unsigned long nr_pages;
+ int err;
- *ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- mutex_lock(&inode->i_mutex);
- err = ocfs2_rw_lock(inode, 1);
- if (err < 0) {
- mlog_errno(err);
- } else {
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
+ err = generic_write_sync(out, *ppos, ret);
+ if (err)
+ ret = err;
+ else
+ *ppos += ret;
- if (err)
- ret = err;
- }
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fbc9fc7..d66cf4f7c70e 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr);
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4dc8890ba316..0297fb8982b8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
/* set the inodes dtime */
- status = ocfs2_journal_access_di(handle, inode, di_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_commit;
}
- ocfs2_remove_from_cache(inode, di_bh);
+ ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
vfs_dq_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
@@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
+ status = ocfs2_remove_refcount_tree(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
@@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_inode_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_exit(INODE_CACHE(inode));
- mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+ mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
"Clear inode of %llu, inode has %u cache items\n",
- (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+ (unsigned long long)oi->ip_blkno,
+ INODE_CACHE(inode)->ci_num_cached);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+ mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
"Clear inode of %llu, inode has a bad flag\n",
(unsigned long long)oi->ip_blkno);
@@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode)
(unsigned long long)oi->ip_blkno, oi->ip_open_count);
/* Clear all other flags. */
- oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
+ oi->ip_flags = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
@@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
mlog_entry("(inode %llu)\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_journal_access_di(handle, inode, bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
- flags, ocfs2_validate_inode_block);
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags, ocfs2_validate_inode_block);
/* If ocfs2_read_blocks() got us a new bh, pass it up. */
if (!rc && !*bh)
@@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
{
return ocfs2_read_inode_block_full(inode, bh, 0);
}
+
+
+static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->ip_blkno;
+}
+
+static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->vfs_inode.i_sb;
+}
+
+static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_lock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_lock(&oi->ip_io_mutex);
+}
+
+static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_unlock(&oi->ip_io_mutex);
+}
+
+const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
+ .co_owner = ocfs2_inode_cache_owner,
+ .co_get_super = ocfs2_inode_cache_get_super,
+ .co_cache_lock = ocfs2_inode_cache_lock,
+ .co_cache_unlock = ocfs2_inode_cache_unlock,
+ .co_io_lock = ocfs2_inode_cache_io_lock,
+ .co_io_unlock = ocfs2_inode_cache_io_unlock,
+};
+
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ea71525aad41..ba4fe07b293c 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -60,12 +60,6 @@ struct ocfs2_inode_info
u32 ip_dir_start_lookup;
- /* next two are protected by trans_inc_lock */
- /* which transaction were we created on? Zero if none. */
- unsigned long ip_created_trans;
- /* last transaction we were a part of. */
- unsigned long ip_last_trans;
-
struct ocfs2_caching_info ip_metadata_cache;
struct ocfs2_extent_map ip_extent_map;
@@ -106,8 +100,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT 0x00000040
-/* Indicates that the metadata cache should be used as an array. */
-#define OCFS2_INODE_CACHE_INLINE 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
@@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
+extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
+
+static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
+{
+ return &OCFS2_I(inode)->ip_metadata_cache;
+}
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
@@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
/* The same, but can be passed OCFS2_BH_* flags */
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags);
+
+static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
+{
+ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
+}
+
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 467b413bec21..31fbb0619510 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -21,6 +21,7 @@
#include "ocfs2_fs.h"
#include "ioctl.h"
#include "resize.h"
+#include "refcounttree.h"
#include <linux/ext2_fs.h>
@@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int status;
struct ocfs2_space_resv sr;
struct ocfs2_new_group_input input;
+ struct reflink_arguments args;
+ const char *old_path, *new_path;
+ bool preserve;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -160,6 +164,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
return ocfs2_group_add(inode, &input);
+ case OCFS2_IOC_REFLINK:
+ if (copy_from_user(&args, (struct reflink_arguments *)arg,
+ sizeof(args)))
+ return -EFAULT;
+ old_path = (const char *)(unsigned long)args.old_path;
+ new_path = (const char *)(unsigned long)args.new_path;
+ preserve = (args.preserve != 0);
+
+ return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
default:
return -ENOTTY;
}
@@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ case OCFS2_IOC_REFLINK:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c48b93ac6b65..54c16b66327e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -48,6 +48,7 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "quota.h"
#include "buffer_head_io.h"
@@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = {
.ot_offset = offsetof(struct ocfs2_extent_block, h_check),
};
+static struct ocfs2_triggers rb_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
static struct ocfs2_triggers gd_triggers = {
.ot_triggers = {
.t_commit = ocfs2_commit_trigger,
@@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = {
};
static int __ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh,
struct ocfs2_triggers *triggers,
int type)
{
int status;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
- BUG_ON(!inode);
+ BUG_ON(!ci || !ci->ci_ops);
BUG_ON(!handle);
BUG_ON(!bh);
@@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle,
BUG();
}
- /* Set the current transaction information on the inode so
+ /* Set the current transaction information on the ci so
* that the locking code knows whether it can drop it's locks
- * on this inode or not. We're protected from the commit
+ * on this ci or not. We're protected from the commit
* thread updating the current transaction id until
* ocfs2_commit_trans() because ocfs2_start_trans() took
* j_trans_barrier for us. */
- ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+ ocfs2_set_ci_lock_trans(osb->journal, ci);
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle,
status = -EINVAL;
mlog(ML_ERROR, "Uknown access type!\n");
}
- if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+ if (!status && ocfs2_meta_ecc(osb) && triggers)
jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
if (status < 0)
mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle,
return status;
}
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, int type)
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
}
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
}
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+ return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
type);
}
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
}
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
}
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
}
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
}
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
}
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+ return __ocfs2_journal_access(handle, ci, bh, NULL, type);
}
int ocfs2_journal_dirty(handle_t *handle,
@@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
ocfs2_bump_recovery_generation(fe);
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
- status = ocfs2_write_block(osb, bh, journal->j_inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
if (status < 0)
mlog_errno(status);
@@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
ocfs2_get_recovery_generation(fe);
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
- status = ocfs2_write_block(osb, bh, inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2c3222aec622..3f74e09b0d80 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
return old_id;
}
-static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
- struct inode *inode)
+static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
+ struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+ ci->ci_last_trans = journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Used to figure out whether it's safe to drop a metadata lock on an
- * inode. Returns true if all the inodes changes have been
+ * cached object. Returns true if all the object's changes have been
* checkpointed to disk. You should be holding the spinlock on the
* metadata lock while calling this to be sure that nobody can take
* the lock and put it on another transaction. */
-static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
{
int ret;
- struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
spin_lock(&trans_inc_lock);
- ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+ ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
-/* convenience function to check if an inode is still new (has never
- * hit disk) Will do you a favor and set created_trans = 0 when you've
- * been checkpointed. returns '1' if the inode is still new. */
-static inline int ocfs2_inode_is_new(struct inode *inode)
+/* convenience function to check if an object backed by struct
+ * ocfs2_caching_info is still new (has never hit disk) Will do you a
+ * favor and set created_trans = 0 when you've
+ * been checkpointed. returns '1' if the ci is still new. */
+static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
{
int ret;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
+ spin_lock(&trans_inc_lock);
+ ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
+ if (!ret)
+ ci->ci_created_trans = 0;
+ spin_unlock(&trans_inc_lock);
+ return ret;
+}
+
+/* Wrapper for inodes so we can check system files */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
/* System files are never "new" as they're written out by
* mkfs. This helps us early during mount, before we have the
* journal open and j_trans_id could be junk. */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
return 0;
- spin_lock(&trans_inc_lock);
- ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
- OCFS2_I(inode)->ip_created_trans));
- if (!ret)
- OCFS2_I(inode)->ip_created_trans = 0;
- spin_unlock(&trans_inc_lock);
- return ret;
+
+ return ocfs2_ci_is_new(INODE_CACHE(inode));
}
-static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
- struct inode *inode)
+static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
+ struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+ ci->ci_created_trans = osb->journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
@@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
if (ocfs2_mount_local(osb))
return;
- if (!ocfs2_inode_fully_checkpointed(inode)) {
+ if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
@@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
ocfs2_start_checkpoint(osb);
wait_event(osb->journal->j_checkpointed,
- ocfs2_inode_fully_checkpointed(inode));
+ ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
}
}
@@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
/* ocfs2_inode */
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_extent_block */
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_group_desc */
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_xattr_block */
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* quota blocks */
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* dirblock */
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_root_block */
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_leaf */
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* Anything that has no ecc */
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/*
@@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
return credits;
}
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+ + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
/*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index bac7e6abaf47..ac10f83edb95 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
memcpy(alloc_copy, alloc, bh->b_size);
- status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
ocfs2_clear_local_alloc(alloc);
ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
- status = ocfs2_write_block(osb, alloc_bh, inode);
+ status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
@@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
* delete bits from it! */
*num_bits = bits_wanted;
- status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
@@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
- status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72ec..39737613424a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -202,7 +202,7 @@ out:
return ret;
}
-static struct vm_operations_struct ocfs2_file_vm_ops = {
+static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
};
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8601f934010b..f010b22b1c44 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
struct inode *inode,
- struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
- struct inode *inode,
+ u64 blkno,
char *name,
struct ocfs2_dir_lookup_result *lookup);
@@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir,
}
did_quota_inode = 1;
+ mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+ inode->i_mode, (unsigned long)dev, dentry->d_name.len,
+ dentry->d_name.name);
+
/* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
+ status = ocfs2_mknod_locked(osb, dir, inode, dev,
&new_fe_bh, parent_fe_bh, handle,
inode_ac);
if (status < 0) {
@@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
+ parent_fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -465,7 +469,6 @@ leave:
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
struct inode *inode,
- struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
@@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
u16 suballoc_bit;
u16 feat;
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
- inode->i_mode, (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
-
*new_fe_bh = NULL;
status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
mlog_errno(status);
goto leave;
}
- ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
- status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ *new_fe_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
}
ocfs2_populate_inode(inode, fe, 1);
- ocfs2_inode_set_new(osb, inode);
+ ocfs2_ci_set_new(osb, INODE_CACHE(inode));
if (!ocfs2_mount_local(osb)) {
status = ocfs2_create_new_inode_locks(inode);
if (status < 0)
@@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- err = ocfs2_journal_access_di(handle, inode, fe_bh,
+ err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
mlog_errno(err);
@@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir,
}
if (inode_is_unlinkable(inode)) {
- status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+ OCFS2_I(inode)->ip_blkno,
orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
@@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir,
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
- new_inode,
- orphan_name,
- &orphan_insert);
+ OCFS2_I(new_inode)->ip_blkno,
+ orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
}
- status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
+ newfe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir,
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
+ old_inode_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir,
(int)old_dir_nlink, old_dir->i_nlink);
} else {
struct ocfs2_dinode *fe;
- status = ocfs2_journal_access_di(handle, old_dir,
- old_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(old_dir),
+ old_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
+ bhs[virtual]);
- status = ocfs2_journal_access(handle, inode, bhs[virtual],
+ status = ocfs2_journal_access(handle, INODE_CACHE(inode),
+ bhs[virtual],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir,
}
did_quota_inode = 1;
- status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+ mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
+ inode->i_mode, dentry->d_name.len,
+ dentry->d_name.name);
+
+ status = ocfs2_mknod_locked(osb, dir, inode,
0, &new_fe_bh, parent_fe_bh, handle,
inode_ac);
if (status < 0) {
@@ -1842,7 +1851,7 @@ bail:
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
- struct inode *inode,
+ u64 blkno,
char *name,
struct ocfs2_dir_lookup_result *lookup)
{
@@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ status = ocfs2_blkno_stringify(blkno, name);
if (status < 0) {
mlog_errno(status);
return status;
@@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -2028,6 +2041,274 @@ leave:
return status;
}
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode)
+{
+ int status, did_quota_inode = 0;
+ struct inode *inode = NULL;
+ struct inode *orphan_dir = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *di = NULL;
+ handle_t *handle = NULL;
+ char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+ struct buffer_head *parent_di_bh = NULL;
+ struct buffer_head *new_di_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ /*
+ * We give the orphan dir the root blkno to fake an orphan name,
+ * and allocate enough space for our insertion.
+ */
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+ osb->root_blkno,
+ orphan_name, &orphan_insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* reserve an inode spot */
+ status = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+ * to be called. */
+ if (sb_any_quota_active(osb->sb) &&
+ osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+ status = -EDQUOT;
+ goto leave;
+ }
+ did_quota_inode = 1;
+
+ /* do the real work now. */
+ status = ocfs2_mknod_locked(osb, dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ di = (struct ocfs2_dinode *)new_di_bh->b_data;
+ status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* get open lock so that only nodes can't remove it from orphan dir. */
+ status = ocfs2_open_lock(inode);
+ if (status < 0)
+ mlog_errno(status);
+
+leave:
+ if (status < 0 && did_quota_inode)
+ vfs_dq_free_inode(inode);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
+
+ if (orphan_dir) {
+ /* This was locked for us in ocfs2_prepare_orphan_dir() */
+ ocfs2_inode_unlock(orphan_dir, 1);
+ mutex_unlock(&orphan_dir->i_mutex);
+ iput(orphan_dir);
+ }
+
+ if (status == -ENOSPC)
+ mlog(0, "Disk is full\n");
+
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
+ iput(inode);
+ }
+
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ brelse(new_di_bh);
+
+ if (!status)
+ *new_inode = inode;
+
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ return status;
+}
+
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *inode,
+ struct dentry *dentry)
+{
+ int status = 0;
+ struct buffer_head *parent_di_bh = NULL;
+ handle_t *handle = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *dir_di, *di;
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+ mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
+ dentry->d_name.len, dentry->d_name.name);
+
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ if (!dir_di->i_links_count) {
+ /* can't make a file in a deleted directory. */
+ status = -ENOENT;
+ goto leave;
+ }
+
+ status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+ dentry->d_name.len);
+ if (status)
+ goto leave;
+
+ /* get a spot inside the dir. */
+ status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ dentry->d_name.name,
+ dentry->d_name.len, &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ osb->slot_num);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ mutex_lock(&orphan_dir_inode->i_mutex);
+
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+ goto leave;
+ }
+
+ status = ocfs2_read_inode_block(inode, &di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto orphan_unlock;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto orphan_unlock;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+ di->i_orphaned_slot = 0;
+ ocfs2_journal_dirty(handle, di_bh);
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ status = ocfs2_dentry_attach_lock(dentry, inode,
+ OCFS2_I(dir)->ip_blkno);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ insert_inode_hash(inode);
+ dentry->d_op = &ocfs2_dentry_ops;
+ d_instantiate(dentry, inode);
+ status = 0;
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+leave:
+
+ ocfs2_inode_unlock(dir, 1);
+
+ brelse(di_bh);
+ brelse(parent_di_bh);
+ brelse(orphan_dir_bh);
+
+ ocfs2_free_dir_lookup_result(&lookup);
+
+ mlog_exit(status);
+
+ return status;
+}
+
const struct inode_operations ocfs2_dir_iops = {
.create = ocfs2_create,
.lookup = ocfs2_lookup,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef64c879..e5d059d4f115 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct inode *orphan_dir_inode,
struct inode *inode,
struct buffer_head *orphan_dir_bh);
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *new_inode,
+ struct dentry *new_dentry);
#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 39e1d5a39505..9362eea7424b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -35,12 +35,7 @@
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
@@ -51,20 +46,51 @@
/* For struct ocfs2_blockcheck_stats */
#include "blockcheck.h"
+
+/* Caching of metadata buffers */
+
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
* amount of inlined blocks to be stored on an array and grow the
* structure into a rb tree when necessary. */
-#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+#define OCFS2_CACHE_INFO_MAX_ARRAY 2
+
+/* Flags for ocfs2_caching_info */
+
+enum ocfs2_caching_info_flags {
+ /* Indicates that the metadata cache is using the inline array */
+ OCFS2_CACHE_FL_INLINE = 1<<1,
+};
+struct ocfs2_caching_operations;
struct ocfs2_caching_info {
+ /*
+ * The parent structure provides the locks, but because the
+ * parent structure can differ, it provides locking operations
+ * to struct ocfs2_caching_info.
+ */
+ const struct ocfs2_caching_operations *ci_ops;
+
+ /* next two are protected by trans_inc_lock */
+ /* which transaction were we created on? Zero if none. */
+ unsigned long ci_created_trans;
+ /* last transaction we were a part of. */
+ unsigned long ci_last_trans;
+
+ /* Cache structures */
+ unsigned int ci_flags;
unsigned int ci_num_cached;
union {
- sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+ sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
struct rb_root ci_tree;
} ci_cache;
};
+/*
+ * Need this prototype here instead of in uptodate.h because journal.h
+ * uses it.
+ */
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
@@ -219,9 +245,11 @@ enum ocfs2_mount_options
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
- OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
- OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
- OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
+ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
+ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
+ control lists */
+ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
+ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -377,12 +405,17 @@ struct ocfs2_super
/* the group we used to allocate inodes. */
u64 osb_inode_alloc_group;
+
+ /* rb tree root for refcount lock. */
+ struct rb_root osb_rf_lock_tree;
+ struct ocfs2_refcount_tree *osb_ref_tree_lru;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
/* Useful typedef for passing around journal access functions */
-typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+typedef int (*ocfs2_journal_access_func)(handle_t *handle,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
static inline int ocfs2_should_order_data(struct inode *inode)
@@ -480,6 +513,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
ocfs2_set_links_count(di, links);
}
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -578,6 +618,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
#define OCFS2_IS_VALID_DX_LEAF(ptr) \
(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
+ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
+
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e5e77c..e9431e4a5e7c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -98,7 +99,8 @@
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
| OCFS2_FEATURE_INCOMPAT_XATTR \
| OCFS2_FEATURE_INCOMPAT_META_ECC \
- | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -160,6 +162,9 @@
/* Metadata checksum and error correction */
#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -223,6 +228,7 @@
#define OCFS2_HAS_XATTR_FL (0x0002)
#define OCFS2_INLINE_XATTR_FL (0x0004)
#define OCFS2_INDEXED_DIR_FL (0x0008)
+#define OCFS2_HAS_REFCOUNT_FL (0x0010)
/* Inode attributes, keep in sync with EXT2 */
#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
@@ -241,8 +247,11 @@
/*
* Extent record flags (e_node.leaf.flags)
*/
-#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
- * unwritten */
+#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
+ * unwritten */
+#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
+ * counted in an associated
+ * refcount tree */
/*
* ioctl commands
@@ -292,6 +301,15 @@ struct ocfs2_new_group_input {
#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+ __u64 old_path;
+ __u64 new_path;
+ __u64 preserve;
+};
+#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+
+
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
*/
@@ -717,7 +735,8 @@ struct ocfs2_dinode {
__le64 i_xattr_loc;
/*80*/ struct ocfs2_block_check i_check; /* Error checking */
/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
- __le64 i_reserved2[5];
+/*90*/ __le64 i_refcount_loc;
+ __le64 i_reserved2[4];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -901,6 +920,60 @@ struct ocfs2_group_desc
/*40*/ __u8 bg_bitmap[0];
};
+struct ocfs2_refcount_rec {
+/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
+ __le32 r_clusters; /* Clusters covered by this extent */
+ __le32 r_refcount; /* Reference count of this extent */
+/*10*/
+};
+#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
+
+#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/ __le16 rl_count; /* Maximum number of entries possible
+ in rl_records */
+ __le16 rl_used; /* Current number of used records */
+ __le32 rl_reserved2;
+ __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/ __u8 rf_signature[8]; /* Signature for verification */
+ __le16 rf_suballoc_slot; /* Slot suballocator this block
+ belongs to */
+ __le16 rf_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 rf_fs_generation; /* Must match superblock */
+/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
+ __le64 rf_parent; /* Parent block, only valid if
+ OCFS2_REFCOUNT_LEAF_FL is set in
+ rf_flags */
+/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
+ __le64 rf_last_eb_blk; /* Pointer to last extent block */
+/*30*/ __le32 rf_count; /* Number of inodes sharing this
+ refcount tree */
+ __le32 rf_flags; /* See the flags above */
+ __le32 rf_clusters; /* clusters covered by refcount tree. */
+ __le32 rf_cpos; /* cluster offset in refcount tree.*/
+/*40*/ __le32 rf_generation; /* generation number. all be the same
+ * for the same refcount tree. */
+ __le32 rf_reserved0;
+ __le64 rf_reserved1[7];
+/*80*/ union {
+ struct ocfs2_refcount_list rf_records; /* List of refcount
+ records */
+ struct ocfs2_extent_list rf_list; /* Extent record list,
+ only valid if
+ OCFS2_REFCOUNT_TREE_FL
+ is set in rf_flags */
+ };
+/* Actual on-disk size is one block */
+};
+
/*
* On disk extended attribute structure for OCFS2.
*/
@@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+ return size / sizeof(struct ocfs2_refcount_rec);
+}
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+ return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index c212cf5a2bdf..d277aabf5dfb 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -49,6 +49,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ OCFS2_LOCK_TYPE_REFCOUNT,
OCFS2_NUM_LOCK_TYPES
};
@@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
c = 'P';
break;
+ case OCFS2_LOCK_TYPE_REFCOUNT:
+ c = 'T';
+ break;
default:
c = '\0';
}
@@ -110,6 +114,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
+ [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
struct buffer_head **bh);
-extern struct dquot_operations ocfs2_quota_operations;
+extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..b437dc0c4cad 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block,
err = -EIO;
mlog_errno(err);
}
- return err;;
+ return err;
}
/* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
flush_dcache_page(bh->b_page);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ocfs2_set_buffer_uptodate(gqinode, bh);
- err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
+ err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
+ ja_type);
if (err < 0) {
brelse(bh);
goto out;
@@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
-struct dquot_operations ocfs2_quota_operations = {
+const struct dquot_operations ocfs2_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bdb09cb6e1fe..21f9e71223ca 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
mlog_errno(status);
return status;
}
- status = ocfs2_journal_access_dq(handle, inode, bh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
/* Release local quota file entry */
- status = ocfs2_journal_access_dq(handle, lqinode,
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(lqinode),
qbh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
mlog_errno(status);
goto out_bh;
}
- status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
goto out_trans;
}
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_new_buffer_uptodate(lqinode, bh);
- status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
mlog_errno(status);
goto out_trans;
}
- ocfs2_set_new_buffer_uptodate(lqinode, dbh);
- status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
mlog_errno(status);
goto out;
}
- ocfs2_set_new_buffer_uptodate(lqinode, bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
/* Local quota info, chunk header and the new block we initialize */
handle = ocfs2_start_trans(OCFS2_SB(sb),
@@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
goto out;
}
/* Zero created block */
- status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
goto out_trans;
}
/* Update chunk header */
- status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ chunk->qc_headerbh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
goto out;
}
- status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(sb_dqopt(sb)->files[type]),
od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1321,7 +1325,7 @@ out:
return status;
}
-static struct quota_format_ops ocfs2_format_ops = {
+static const struct quota_format_ops ocfs2_format_ops = {
.check_quota_file = ocfs2_local_check_quota_file,
.read_file_info = ocfs2_local_read_info,
.write_file_info = ocfs2_global_write_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 000000000000..3a0df7a1b810
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4372 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sort.h>
+#define MLOG_MASK_PREFIX ML_REFCOUNT
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "aops.h"
+#include "xattr.h"
+#include "namei.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+
+struct ocfs2_cow_context {
+ struct inode *inode;
+ u32 cow_start;
+ u32 cow_len;
+ struct ocfs2_extent_tree data_et;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ void *cow_object;
+ struct ocfs2_post_refcount *post_refcount;
+ int extra_credits;
+ int (*get_clusters)(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags);
+ int (*cow_duplicate_clusters)(handle_t *handle,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+};
+
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+ return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+}
+
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)bh->b_data;
+
+ mlog(0, "Validating refcount block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return rc;
+ }
+
+
+ if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has an invalid "
+ "rf_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+ u64 rb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(ci, rb_blkno, &tmp,
+ ocfs2_validate_refcount_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+ .co_owner = ocfs2_refcount_cache_owner,
+ .co_get_super = ocfs2_refcount_cache_get_super,
+ .co_cache_lock = ocfs2_refcount_cache_lock,
+ .co_cache_unlock = ocfs2_refcount_cache_unlock,
+ .co_io_lock = ocfs2_refcount_cache_io_lock,
+ .co_io_unlock = ocfs2_refcount_cache_io_unlock,
+};
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+ struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+ struct ocfs2_refcount_tree *tree = NULL;
+
+ while (n) {
+ tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+ if (blkno < tree->rf_blkno)
+ n = n->rb_left;
+ else if (blkno > tree->rf_blkno)
+ n = n->rb_right;
+ else
+ return tree;
+ }
+
+ return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *new)
+{
+ u64 rf_blkno = new->rf_blkno;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+ struct ocfs2_refcount_tree *tmp;
+
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+ rf_node);
+
+ if (rf_blkno < tmp->rf_blkno)
+ p = &(*p)->rb_left;
+ else if (rf_blkno > tmp->rf_blkno)
+ p = &(*p)->rb_right;
+ else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+ (unsigned long long)rf_blkno);
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->rf_node, parent, p);
+ rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
+{
+ ocfs2_metadata_cache_exit(&tree->rf_ci);
+ ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
+ ocfs2_lock_res_free(&tree->rf_lockres);
+ kfree(tree);
+}
+
+static inline void
+ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree)
+{
+ rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+ if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
+ osb->osb_ref_tree_lru = NULL;
+}
+
+static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree)
+{
+ spin_lock(&osb->osb_lock);
+ ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+ spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+{
+ struct ocfs2_refcount_tree *tree =
+ container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
+
+ ocfs2_free_refcount_tree(tree);
+}
+
+static inline void
+ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
+{
+ kref_get(&tree->rf_getcnt);
+}
+
+static inline void
+ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
+{
+ kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
+}
+
+static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
+ struct super_block *sb)
+{
+ ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
+ mutex_init(&new->rf_io_mutex);
+ new->rf_sb = sb;
+ spin_lock_init(&new->rf_lock);
+}
+
+static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *new,
+ u64 rf_blkno, u32 generation)
+{
+ init_rwsem(&new->rf_sem);
+ ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
+ rf_blkno, generation);
+}
+
+static struct ocfs2_refcount_tree*
+ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
+{
+ struct ocfs2_refcount_tree *new;
+
+ new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+ if (!new)
+ return NULL;
+
+ new->rf_blkno = rf_blkno;
+ kref_init(&new->rf_getcnt);
+ ocfs2_init_refcount_tree_ci(new, osb->sb);
+
+ return new;
+}
+
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+ struct ocfs2_refcount_tree **ret_tree)
+{
+ int ret = 0;
+ struct ocfs2_refcount_tree *tree, *new = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *ref_rb;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->osb_ref_tree_lru &&
+ osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+ tree = osb->osb_ref_tree_lru;
+ else
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ spin_unlock(&osb->osb_lock);
+
+ new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
+ if (!new) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+ /*
+ * We need the generation to create the refcount tree lock and since
+ * it isn't changed during the tree modification, we are safe here to
+ * read without protection.
+ * We also have to purge the cache after we create the lock since the
+ * refcount block may have the stale data. It can only be trusted when
+ * we hold the refcount lock.
+ */
+ ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_metadata_cache_exit(&new->rf_ci);
+ kfree(new);
+ return ret;
+ }
+
+ ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
+ ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
+ new->rf_generation);
+ ocfs2_metadata_cache_purge(&new->rf_ci);
+
+ spin_lock(&osb->osb_lock);
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ ocfs2_insert_refcount_tree(osb, new);
+
+ tree = new;
+ new = NULL;
+
+out:
+ *ret_tree = tree;
+
+ osb->osb_ref_tree_lru = tree;
+
+ spin_unlock(&osb->osb_lock);
+
+ if (new)
+ ocfs2_free_refcount_tree(new);
+
+ brelse(ref_root_bh);
+ return ret;
+}
+
+static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ *ref_blkno = le64_to_cpu(di->i_refcount_loc);
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree, int rw)
+{
+ int ret;
+
+ ret = ocfs2_refcount_lock(tree, rw);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rw)
+ down_write(&tree->rf_sem);
+ else
+ down_read(&tree->rf_sem);
+
+out:
+ return ret;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really needs it.
+ *
+ * If the tree has been re-created by other node, it will free the
+ * old one and re-create it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+ u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **ret_tree,
+ struct buffer_head **ref_bh)
+{
+ int ret, delete_tree = 0;
+ struct ocfs2_refcount_tree *tree = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+
+again:
+ ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_refcount_tree_get(tree);
+
+ ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_refcount_tree_put(tree);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_unlock_refcount_tree(osb, tree, rw);
+ ocfs2_refcount_tree_put(tree);
+ goto out;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ /*
+ * If the refcount block has been freed and re-created, we may need
+ * to recreate the refcount tree also.
+ *
+ * Here we just remove the tree from the rb-tree, and the last
+ * kref holder will unlock and delete this refcount_tree.
+ * Then we goto "again" and ocfs2_get_refcount_tree will create
+ * the new refcount tree for us.
+ */
+ if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
+ if (!tree->rf_removed) {
+ ocfs2_erase_refcount_tree_from_list(osb, tree);
+ tree->rf_removed = 1;
+ delete_tree = 1;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, tree, rw);
+ /*
+ * We get an extra reference when we create the refcount
+ * tree, so another put will destroy it.
+ */
+ if (delete_tree)
+ ocfs2_refcount_tree_put(tree);
+ brelse(ref_root_bh);
+ ref_root_bh = NULL;
+ goto again;
+ }
+
+ *ret_tree = tree;
+ if (ref_bh) {
+ *ref_bh = ref_root_bh;
+ ref_root_bh = NULL;
+ }
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
+ struct ocfs2_refcount_tree **ret_tree,
+ struct buffer_head **ref_bh)
+{
+ int ret;
+ u64 ref_blkno;
+
+ ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
+ rw, ret_tree, ref_bh);
+}
+
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree, int rw)
+{
+ if (rw)
+ up_write(&tree->rf_sem);
+ else
+ up_read(&tree->rf_sem);
+
+ ocfs2_refcount_unlock(tree, rw);
+ ocfs2_refcount_tree_put(tree);
+}
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
+{
+ struct rb_node *node;
+ struct ocfs2_refcount_tree *tree;
+ struct rb_root *root = &osb->osb_rf_lock_tree;
+
+ while ((node = rb_last(root)) != NULL) {
+ tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
+
+ mlog(0, "Purge tree %llu\n",
+ (unsigned long long) tree->rf_blkno);
+
+ rb_erase(&tree->rf_node, root);
+ ocfs2_free_refcount_tree(tree);
+ }
+}
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ */
+static int ocfs2_create_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+ mlog(0, "create tree for inode %lu\n", inode->i_ino);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &first_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
+ if (!new_tree) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Initialize ocfs2_refcount_block. */
+ rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ memset(rb, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
+ rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+ rb->rf_blkno = cpu_to_le64(first_blkno);
+ rb->rf_count = cpu_to_le32(1);
+ rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+ spin_lock(&osb->osb_lock);
+ rb->rf_generation = osb->s_next_generation++;
+ spin_unlock(&osb->osb_lock);
+
+ ocfs2_journal_dirty(handle, new_bh);
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = cpu_to_le64(first_blkno);
+ spin_unlock(&oi->ip_lock);
+
+ mlog(0, "created tree for inode %lu, refblock %llu\n",
+ inode->i_ino, (unsigned long long)first_blkno);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ /*
+ * We have to init the tree lock here since it will use
+ * the generation number to create it.
+ */
+ new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
+ ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
+ new_tree->rf_generation);
+
+ spin_lock(&osb->osb_lock);
+ tree = ocfs2_find_refcount_tree(osb, first_blkno);
+
+ /*
+ * We've just created a new refcount tree in this block. If
+ * we found a refcount tree on the ocfs2_super, it must be
+ * one we just deleted. We free the old tree before
+ * inserting the new tree.
+ */
+ BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
+ if (tree)
+ ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+ ocfs2_insert_refcount_tree(osb, new_tree);
+ spin_unlock(&osb->osb_lock);
+ new_tree = NULL;
+ if (tree)
+ ocfs2_refcount_tree_put(tree);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (new_tree) {
+ ocfs2_metadata_cache_exit(&new_tree->rf_ci);
+ kfree(new_tree);
+ }
+
+ brelse(new_bh);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+static int ocfs2_set_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 refcount_loc)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_tree *ref_tree;
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ le32_add_cpu(&rb->rf_count, 1);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = cpu_to_le64(refcount_loc);
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ return ret;
+}
+
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+ int ret, delete_tree = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+ struct inode *alloc_inode = NULL;
+ struct buffer_head *alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
+ u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
+ u16 bit = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ return 0;
+
+ BUG_ON(!ref_blkno);
+ ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+ /*
+ * If we are the last user, we need to free the block.
+ * So lock the allocator ahead.
+ */
+ if (le32_to_cpu(rb->rf_count) == 1) {
+ blk = le64_to_cpu(rb->rf_blkno);
+ bit = le16_to_cpu(rb->rf_suballoc_bit);
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(rb->rf_suballoc_slot));
+ if (!alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ credits += OCFS2_SUBALLOC_FREE;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = 0;
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ le32_add_cpu(&rb->rf_count , -1);
+ ocfs2_journal_dirty(handle, blk_bh);
+
+ if (!rb->rf_count) {
+ delete_tree = 1;
+ ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
+ ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+ alloc_bh, bit, bg_blkno, 1);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ if (alloc_inode) {
+ ocfs2_inode_unlock(alloc_inode, 1);
+ brelse(alloc_bh);
+ }
+out_mutex:
+ if (alloc_inode) {
+ mutex_unlock(&alloc_inode->i_mutex);
+ iput(alloc_inode);
+ }
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ if (delete_tree)
+ ocfs2_refcount_tree_put(ref_tree);
+ brelse(blk_bh);
+
+ return ret;
+}
+
+static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_leaf_bh,
+ u64 cpos, unsigned int len,
+ struct ocfs2_refcount_rec *ret_rec,
+ int *index)
+{
+ int i = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_rec *rec = NULL;
+
+ for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
+ rec = &rb->rf_records.rl_recs[i];
+
+ if (le64_to_cpu(rec->r_cpos) +
+ le32_to_cpu(rec->r_clusters) <= cpos)
+ continue;
+ else if (le64_to_cpu(rec->r_cpos) > cpos)
+ break;
+
+ /* ok, cpos fail in this rec. Just return. */
+ if (ret_rec)
+ *ret_rec = *rec;
+ goto out;
+ }
+
+ if (ret_rec) {
+ /* We meet with a hole here, so fake the rec. */
+ ret_rec->r_cpos = cpu_to_le64(cpos);
+ ret_rec->r_refcount = 0;
+ if (i < le16_to_cpu(rb->rf_records.rl_used) &&
+ le64_to_cpu(rec->r_cpos) < cpos + len)
+ ret_rec->r_clusters =
+ cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
+ else
+ ret_rec->r_clusters = cpu_to_le32(len);
+ }
+
+out:
+ *index = i;
+}
+
+/*
+ * Try to remove refcount tree. The mechanism is:
+ * 1) Check whether i_clusters == 0, if no, exit.
+ * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
+ * 3) Check whether we have inline xattr stored outside, if yes, exit.
+ * 4) Remove the tree.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&oi->ip_xattr_sem);
+ down_write(&oi->ip_alloc_sem);
+
+ if (oi->ip_clusters)
+ goto out;
+
+ if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
+ goto out;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
+ ocfs2_has_inline_xattr_value_outside(inode, di))
+ goto out;
+
+ ret = ocfs2_remove_refcount_tree(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+out:
+ up_write(&oi->ip_alloc_sem);
+ up_write(&oi->ip_xattr_sem);
+ return 0;
+}
+
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which start from cpos
+ * and end at a small value between cpos+len and start of the next record.
+ * This fake record has r_refcount = 0.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, unsigned int len,
+ struct ocfs2_refcount_rec *ret_rec,
+ int *index,
+ struct buffer_head **ret_bh)
+{
+ int ret = 0, i, found;
+ u32 low_cpos;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *tmp, *rec = NULL;
+ struct ocfs2_extent_block *eb;
+ struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+ ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
+ ret_rec, index);
+ *ret_bh = ref_root_bh;
+ get_bh(ref_root_bh);
+ return 0;
+ }
+
+ el = &rb->rf_list;
+ low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(sb,
+ "refcount tree %llu has non zero tree "
+ "depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ found = 0;
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+ found = 1;
+ break;
+ }
+ }
+
+ /* adjust len when we have ocfs2_extent_rec after it. */
+ if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
+ tmp = &el->l_recs[i+1];
+
+ if (le32_to_cpu(tmp->e_cpos) < cpos + len)
+ len = le32_to_cpu(tmp->e_cpos) - cpos;
+ }
+
+ ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
+ ret_rec, index);
+ *ret_bh = ref_leaf_bh;
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+enum ocfs2_ref_rec_contig {
+ REF_CONTIG_NONE = 0,
+ REF_CONTIG_LEFT,
+ REF_CONTIG_RIGHT,
+ REF_CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_ref_rec_contig
+ ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ if ((rb->rf_records.rl_recs[index].r_refcount ==
+ rb->rf_records.rl_recs[index + 1].r_refcount) &&
+ (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
+ le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
+ le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
+ return REF_CONTIG_RIGHT;
+
+ return REF_CONTIG_NONE;
+}
+
+static enum ocfs2_ref_rec_contig
+ ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
+
+ if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+ ret = ocfs2_refcount_rec_adjacent(rb, index);
+
+ if (index > 0) {
+ enum ocfs2_ref_rec_contig tmp;
+
+ tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
+
+ if (tmp == REF_CONTIG_RIGHT) {
+ if (ret == REF_CONTIG_RIGHT)
+ ret = REF_CONTIG_LEFTRIGHT;
+ else
+ ret = REF_CONTIG_LEFT;
+ }
+ }
+
+ return ret;
+}
+
+static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
+ rb->rf_records.rl_recs[index+1].r_refcount);
+
+ le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
+ le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
+
+ if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
+ memmove(&rb->rf_records.rl_recs[index + 1],
+ &rb->rf_records.rl_recs[index + 2],
+ sizeof(struct ocfs2_refcount_rec) *
+ (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
+
+ memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
+ 0, sizeof(struct ocfs2_refcount_rec));
+ le16_add_cpu(&rb->rf_records.rl_used, -1);
+}
+
+/*
+ * Merge the refcount rec if we are contiguous with the adjacent recs.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ enum ocfs2_ref_rec_contig contig =
+ ocfs2_refcount_rec_contig(rb, index);
+
+ if (contig == REF_CONTIG_NONE)
+ return;
+
+ if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
+ BUG_ON(index == 0);
+ index--;
+ }
+
+ ocfs2_rotate_refcount_rec_left(rb, index);
+
+ if (contig == REF_CONTIG_LEFTRIGHT)
+ ocfs2_rotate_refcount_rec_left(rb, index);
+}
+
+/*
+ * Change the refcount indexed by "index" in ref_bh.
+ * If refcount reaches 0, remove it.
+ */
+static int ocfs2_change_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_leaf_bh,
+ int index, int merge, int change)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rl = &rb->rf_records;
+ struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "change index %d, old count %u, change %d\n", index,
+ le32_to_cpu(rec->r_refcount), change);
+ le32_add_cpu(&rec->r_refcount, change);
+
+ if (!rec->r_refcount) {
+ if (index != le16_to_cpu(rl->rl_used) - 1) {
+ memmove(rec, rec + 1,
+ (le16_to_cpu(rl->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+ memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+ 0, sizeof(struct ocfs2_refcount_rec));
+ }
+
+ le16_add_cpu(&rl->rl_used, -1);
+ } else if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+ if (ret)
+ mlog_errno(ret);
+out:
+ return ret;
+}
+
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head **ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Initialize ocfs2_refcount_block.
+ * It should contain the same information as the old root.
+ * so just memcpy it and change the corresponding field.
+ */
+ memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_cpos = cpu_to_le32(0);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ /* Now change the root. */
+ memset(&root_rb->rf_list, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list));
+ root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+ root_rb->rf_clusters = cpu_to_le32(1);
+ root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+ root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+ root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
+ le16_to_cpu(new_rb->rf_records.rl_used));
+
+ *ref_leaf_bh = new_bh;
+ new_bh = NULL;
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+ struct ocfs2_refcount_rec *next)
+{
+ if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
+ ocfs2_get_ref_rec_low_cpos(next))
+ return 1;
+
+ return 0;
+}
+
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+ const struct ocfs2_refcount_rec *l = a, *r = b;
+ u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
+ u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
+
+ if (l_cpos > r_cpos)
+ return 1;
+ if (l_cpos < r_cpos)
+ return -1;
+ return 0;
+}
+
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+ const struct ocfs2_refcount_rec *l = a, *r = b;
+ u64 l_cpos = le64_to_cpu(l->r_cpos);
+ u64 r_cpos = le64_to_cpu(r->r_cpos);
+
+ if (l_cpos > r_cpos)
+ return 1;
+ if (l_cpos < r_cpos)
+ return -1;
+ return 0;
+}
+
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+ struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+
+ tmp = *(struct ocfs2_refcount_rec *)l;
+ *(struct ocfs2_refcount_rec *)l =
+ *(struct ocfs2_refcount_rec *)r;
+ *(struct ocfs2_refcount_rec *)r = tmp;
+}
+
+/*
+ * The refcount cpos are ordered by their 64bit cpos,
+ * But we will use the low 32 bit to be the e_cpos in the b-tree.
+ * So we need to make sure that this pos isn't intersected with others.
+ *
+ * Note: The refcount block is already sorted by their low 32 bit cpos,
+ * So just try the middle pos first, and we will exit when we find
+ * the good position.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+ u32 *split_pos, int *split_index)
+{
+ int num_used = le16_to_cpu(rl->rl_used);
+ int delta, middle = num_used / 2;
+
+ for (delta = 0; delta < middle; delta++) {
+ /* Let's check delta earlier than middle */
+ if (ocfs2_refcount_rec_no_intersect(
+ &rl->rl_recs[middle - delta - 1],
+ &rl->rl_recs[middle - delta])) {
+ *split_index = middle - delta;
+ break;
+ }
+
+ /* For even counts, don't walk off the end */
+ if ((middle + delta + 1) == num_used)
+ continue;
+
+ /* Now try delta past middle */
+ if (ocfs2_refcount_rec_no_intersect(
+ &rl->rl_recs[middle + delta],
+ &rl->rl_recs[middle + delta + 1])) {
+ *split_index = middle + delta + 1;
+ break;
+ }
+ }
+
+ if (delta >= middle)
+ return -ENOSPC;
+
+ *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
+ return 0;
+}
+
+static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
+ struct buffer_head *new_bh,
+ u32 *split_cpos)
+{
+ int split_index = 0, num_moved, ret;
+ u32 cpos = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rl = &rb->rf_records;
+ struct ocfs2_refcount_block *new_rb =
+ (struct ocfs2_refcount_block *)new_bh->b_data;
+ struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+
+ mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+
+ /*
+ * XXX: Improvement later.
+ * If we know all the high 32 bit cpos is the same, no need to sort.
+ *
+ * In order to make the whole process safe, we do:
+ * 1. sort the entries by their low 32 bit cpos first so that we can
+ * find the split cpos easily.
+ * 2. call ocfs2_insert_extent to insert the new refcount block.
+ * 3. move the refcount rec to the new block.
+ * 4. sort the entries by their 64 bit cpos.
+ * 5. dirty the new_rb and rb.
+ */
+ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+ ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ new_rb->rf_cpos = cpu_to_le32(cpos);
+
+ /* move refcount records starting from split_index to the new block. */
+ num_moved = le16_to_cpu(rl->rl_used) - split_index;
+ memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+ num_moved * sizeof(struct ocfs2_refcount_rec));
+
+ /*ok, remove the entries we just moved over to the other block. */
+ memset(&rl->rl_recs[split_index], 0,
+ num_moved * sizeof(struct ocfs2_refcount_rec));
+
+ /* change old and new rl_used accordingly. */
+ le16_add_cpu(&rl->rl_used, -num_moved);
+ new_rl->rl_used = cpu_to_le32(num_moved);
+
+ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+ sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+ *split_cpos = cpos;
+ return 0;
+}
+
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got, new_cpos;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_extent_tree ref_et;
+
+ BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Initialize ocfs2_refcount_block. */
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ memset(new_rb, 0, sb->s_blocksize);
+ strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ new_rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ new_rb->rf_generation = root_rb->rf_generation;
+
+ ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+ mlog(0, "insert new leaf block %llu at %u\n",
+ (unsigned long long)new_bh->b_blocknr, new_cpos);
+
+ /* Insert the new leaf block with the specific offset cpos. */
+ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
+ 1, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct buffer_head *expand_bh = NULL;
+
+ if (ref_root_bh == ref_leaf_bh) {
+ /*
+ * the old root bh hasn't been expanded to a b-tree,
+ * so expand it first.
+ */
+ ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+ &expand_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ expand_bh = ref_leaf_bh;
+ get_bh(expand_bh);
+ }
+
+
+ /* Now add a new refcount block into the tree.*/
+ ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+ expand_bh, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(expand_bh);
+ return ret;
+}
+
+/*
+ * Adjust the extent rec in b-tree representing ref_leaf_bh.
+ *
+ * Only called when we have inserted a new refcount rec at index 0
+ * which means ocfs2_extent_rec.e_cpos may need some change.
+ */
+static int ocfs2_adjust_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec)
+{
+ int ret = 0, i;
+ u32 new_cpos, old_cpos;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct ocfs2_extent_list *el;
+
+ if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
+ goto out;
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ old_cpos = le32_to_cpu(rb->rf_cpos);
+ new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+ if (old_cpos <= new_cpos)
+ goto out;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, path, old_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * 2 more credits, one for the leaf refcount block, one for
+ * the extent block contains the extent rec.
+ */
+ ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* change the leaf extent block first. */
+ el = path_leaf_el(path);
+
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
+ if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
+ break;
+
+ BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
+
+ el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+
+ /* change the r_cpos in the leaf block. */
+ rb->rf_cpos = cpu_to_le32(new_cpos);
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ if (rf_list->rl_used == rf_list->rl_count) {
+ u64 cpos = le64_to_cpu(rec->r_cpos);
+ u32 len = le32_to_cpu(rec->r_clusters);
+
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, NULL, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (index < le16_to_cpu(rf_list->rl_used))
+ memmove(&rf_list->rl_recs[index + 1],
+ &rf_list->rl_recs[index],
+ (le16_to_cpu(rf_list->rl_used) - index) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ mlog(0, "insert refcount record start %llu, len %u, count %u "
+ "to leaf block %llu at index %d\n",
+ (unsigned long long)le64_to_cpu(rec->r_cpos),
+ le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
+ (unsigned long long)ref_leaf_bh->b_blocknr, index);
+
+ rf_list->rl_recs[index] = *rec;
+
+ le16_add_cpu(&rf_list->rl_used, 1);
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (index == 0) {
+ ret = ocfs2_adjust_refcount_rec(handle, ci,
+ ref_root_bh,
+ ref_leaf_bh, rec);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simple than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *split_rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, recs_need;
+ u32 len;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+ struct ocfs2_refcount_rec *tail_rec = NULL;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
+ le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
+ le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters));
+
+ /*
+ * If we just need to split the header or tail clusters,
+ * no more recs are needed, just split is OK.
+ * Otherwise we at least need one new recs.
+ */
+ if (!split_rec->r_refcount &&
+ (split_rec->r_cpos == orig_rec->r_cpos ||
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) ==
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need = 0;
+ else
+ recs_need = 1;
+
+ /*
+ * We need one more rec if we split in the middle and the new rec have
+ * some refcount in it.
+ */
+ if (split_rec->r_refcount &&
+ (split_rec->r_cpos != orig_rec->r_cpos &&
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) !=
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need++;
+
+ /* If the leaf block don't have enough record, expand it. */
+ if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) {
+ struct ocfs2_refcount_rec tmp_rec;
+ u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+ len = le32_to_cpu(orig_rec->r_clusters);
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have to re-get it since now cpos may be moved to
+ * another leaf block.
+ */
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &tmp_rec, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ orig_rec = &rf_list->rl_recs[index];
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have calculated out how many new records we need and store
+ * in recs_need, so spare enough space first by moving the records
+ * after "index" to the end.
+ */
+ if (index != le16_to_cpu(rf_list->rl_used) - 1)
+ memmove(&rf_list->rl_recs[index + 1 + recs_need],
+ &rf_list->rl_recs[index + 1],
+ (le16_to_cpu(rf_list->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ len = (le64_to_cpu(orig_rec->r_cpos) +
+ le32_to_cpu(orig_rec->r_clusters)) -
+ (le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters));
+
+ /*
+ * If we have "len", the we will split in the tail and move it
+ * to the end of the space we have just spared.
+ */
+ if (len) {
+ tail_rec = &rf_list->rl_recs[index + recs_need];
+
+ memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+ le64_add_cpu(&tail_rec->r_cpos,
+ le32_to_cpu(tail_rec->r_clusters) - len);
+ tail_rec->r_clusters = le32_to_cpu(len);
+ }
+
+ /*
+ * If the split pos isn't the same as the original one, we need to
+ * split in the head.
+ *
+ * Note: We have the chance that split_rec.r_refcount = 0,
+ * recs_need = 0 and len > 0, which means we just cut the head from
+ * the orig_rec and in that case we have done some modification in
+ * orig_rec above, so the check for r_cpos is faked.
+ */
+ if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
+ len = le64_to_cpu(split_rec->r_cpos) -
+ le64_to_cpu(orig_rec->r_cpos);
+ orig_rec->r_clusters = cpu_to_le32(len);
+ index++;
+ }
+
+ le16_add_cpu(&rf_list->rl_used, recs_need);
+
+ if (split_rec->r_refcount) {
+ rf_list->rl_recs[index] = *split_rec;
+ mlog(0, "insert refcount record start %llu, len %u, count %u "
+ "to leaf block %llu at index %d\n",
+ (unsigned long long)le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount),
+ (unsigned long long)ref_leaf_bh->b_blocknr, index);
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+ }
+
+ ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int __ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0, index;
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_refcount_rec rec;
+ unsigned int set_len = 0;
+
+ mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ set_len = le32_to_cpu(rec.r_clusters);
+
+ /*
+ * Here we may meet with 3 situations:
+ *
+ * 1. If we find an already existing record, and the length
+ * is the same, cool, we just need to increase the r_refcount
+ * and it is OK.
+ * 2. If we find a hole, just insert it with r_refcount = 1.
+ * 3. If we are in the middle of one extent record, split
+ * it.
+ */
+ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+ set_len <= len) {
+ mlog(0, "increase refcount rec, start %llu, len %u, "
+ "count %u\n", (unsigned long long)cpos, set_len,
+ le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index,
+ merge, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (!rec.r_refcount) {
+ rec.r_refcount = cpu_to_le32(1);
+
+ mlog(0, "insert refcount rec, start %llu, len %u\n",
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len);
+ ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh,
+ &rec, index,
+ merge, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ set_len = min((u64)(cpos + len),
+ le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+ rec.r_cpos = cpu_to_le64(cpos);
+ rec.r_clusters = cpu_to_le32(set_len);
+ le32_add_cpu(&rec.r_refcount, 1);
+
+ mlog(0, "split refcount rec, start %llu, "
+ "len %u, count %u\n",
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len, le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &rec, index, merge,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += set_len;
+ len -= set_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_extent_tree et;
+
+ BUG_ON(rb->rf_records.rl_used);
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
+ 1, meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_from_cache(ci, ref_leaf_bh);
+
+ /*
+ * add the freed block to the dealloc so that it will be freed
+ * when we run dealloc.
+ */
+ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(rb->rf_suballoc_slot),
+ le64_to_cpu(rb->rf_blkno),
+ le16_to_cpu(rb->rf_suballoc_bit));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ le32_add_cpu(&rb->rf_clusters, -1);
+
+ /*
+ * check whether we need to restore the root refcount block if
+ * there is no leaf extent block at atll.
+ */
+ if (!rb->rf_list.l_next_free_rec) {
+ BUG_ON(rb->rf_clusters);
+
+ mlog(0, "reset refcount tree root %llu to be a record block.\n",
+ (unsigned long long)ref_root_bh->b_blocknr);
+
+ rb->rf_flags = 0;
+ rb->rf_parent = 0;
+ rb->rf_cpos = 0;
+ memset(&rb->rf_records, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records));
+ rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ }
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+out:
+ return ret;
+}
+
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
+ cpos, len, 1,
+ meta_ac, dealloc);
+}
+
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ int index, u64 cpos, unsigned int len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+ BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
+ BUG_ON(cpos + len >
+ le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+
+ if (cpos == le64_to_cpu(rec->r_cpos) &&
+ len == le32_to_cpu(rec->r_clusters))
+ ret = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index, 1, -1);
+ else {
+ struct ocfs2_refcount_rec split = *rec;
+ split.r_cpos = cpu_to_le64(cpos);
+ split.r_clusters = cpu_to_le32(len);
+
+ le32_add_cpu(&split.r_refcount, -1);
+
+ mlog(0, "split refcount rec, start %llu, "
+ "len %u, count %u, original start %llu, len %u\n",
+ (unsigned long long)le64_to_cpu(split.r_cpos),
+ len, le32_to_cpu(split.r_refcount),
+ (unsigned long long)le64_to_cpu(rec->r_cpos),
+ le32_to_cpu(rec->r_clusters));
+ ret = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &split, index, 1,
+ meta_ac, dealloc);
+ }
+
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Remove the leaf refcount block if it contains no refcount record. */
+ if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
+ ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ return ret;
+}
+
+static int __ocfs2_decrease_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ int ret = 0, index = 0;
+ struct ocfs2_refcount_rec rec;
+ unsigned int r_count = 0, r_len;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *ref_leaf_bh = NULL;
+
+ mlog(0, "Tree owner %llu, decrease refcount start %llu, "
+ "len %u, delete %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len, delete);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ r_count = le32_to_cpu(rec.r_refcount);
+ BUG_ON(r_count == 0);
+ if (!delete)
+ BUG_ON(r_count > 1);
+
+ r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+
+ ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh, index,
+ cpos, r_len,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
+ ret = ocfs2_cache_cluster_dealloc(dealloc,
+ ocfs2_clusters_to_blocks(sb, cpos),
+ r_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += r_len;
+ len -= r_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/* Caller must hold refcount tree lock. */
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ int ret;
+ u64 ref_blkno;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *tree;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
+ cpos, len, meta_ac, dealloc, delete);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters.
+ * This adds the refcount extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos,
+ u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+
+ mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
+ inode->i_ino, cpos, len, phys);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ OCFS2_EXT_REFCOUNTED, 0);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 start_cpos,
+ u32 clusters,
+ int *meta_add,
+ int *credits)
+{
+ int ret = 0, index, ref_blocks = 0, recs_add = 0;
+ u64 cpos = start_cpos;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec rec;
+ struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+ u32 len;
+
+ mlog(0, "start_cpos %llu, clusters %u\n",
+ (unsigned long long)start_cpos, clusters);
+ while (clusters) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, clusters, &rec,
+ &index, &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ref_leaf_bh != prev_bh) {
+ /*
+ * Now we encounter a new leaf block, so calculate
+ * whether we need to extend the old leaf.
+ */
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)
+ prev_bh->b_data;
+
+ if (le64_to_cpu(rb->rf_records.rl_used) +
+ recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+ }
+
+ recs_add = 0;
+ *credits += 1;
+ brelse(prev_bh);
+ prev_bh = ref_leaf_bh;
+ get_bh(prev_bh);
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
+ "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
+ recs_add, (unsigned long long)cpos, clusters,
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ le32_to_cpu(rec.r_clusters),
+ le32_to_cpu(rec.r_refcount), index);
+
+ len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+ /*
+ * If the refcount rec already exist, cool. We just need
+ * to check whether there is a split. Otherwise we just need
+ * to increase the refcount.
+ * If we will insert one, increases recs_add.
+ *
+ * We record all the records which will be inserted to the
+ * same refcount block, so that we can tell exactly whether
+ * we need a new refcount block or not.
+ */
+ if (rec.r_refcount) {
+ /* Check whether we need a split at the beginning. */
+ if (cpos == start_cpos &&
+ cpos != le64_to_cpu(rec.r_cpos))
+ recs_add++;
+
+ /* Check whether we need a split in the end. */
+ if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters))
+ recs_add++;
+ } else
+ recs_add++;
+
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ clusters -= len;
+ cpos += len;
+ }
+
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
+
+ if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+
+ *credits += 1;
+ }
+
+ if (!ref_blocks)
+ goto out;
+
+ mlog(0, "we need ref_blocks %d\n", ref_blocks);
+ *meta_add += ref_blocks;
+ *credits += ref_blocks;
+
+ /*
+ * So we may need ref_blocks to insert into the tree.
+ * That also means we need to change the b-tree and add that number
+ * of records since we never merge them.
+ * We need one more block for expansion since the new created leaf
+ * block is also full and needs split.
+ */
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
+ *credits += ocfs2_calc_extend_credits(sb,
+ et.et_root_el,
+ ref_blocks);
+ } else {
+ *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ *meta_add += 1;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ brelse(prev_bh);
+ return ret;
+}
+
+/*
+ * For refcount tree, we will decrease some contiguous clusters
+ * refcount count, so just go through it to see how many blocks
+ * we gonna touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks store these refcount should be
+ * continguous also, so that we can get the number easily.
+ * As for meta_ac, we will at most add split 2 refcount record and
+ * 2 more refcount block, so just check it in a rough way.
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret, ref_blocks = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *tree;
+ u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ ret = -EROFS;
+ goto out;
+ }
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc), &tree);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci,
+ le64_to_cpu(di->i_refcount_loc),
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+ &tree->rf_ci,
+ ref_root_bh,
+ start_cpos, clusters,
+ &ref_blocks, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "reserve new metadata %d, credits = %d\n",
+ ref_blocks, *credits);
+
+ if (ref_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ref_blocks, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+#define MAX_CONTIG_BYTES 1048576
+
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+ return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+}
+
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+ return ~(ocfs2_cow_contig_clusters(sb) - 1);
+}
+
+/*
+ * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
+ * find an offset (start + (n * contig_clusters)) that is closest to cpos
+ * while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+ unsigned int start,
+ unsigned int cpos)
+{
+ BUG_ON(start > cpos);
+
+ return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+ unsigned int len)
+{
+ unsigned int padded =
+ (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
+ ocfs2_cow_contig_mask(sb);
+
+ /* Did we wrap? */
+ if (padded < len)
+ padded = UINT_MAX;
+
+ return padded;
+}
+
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ * max_cpos is the place where we want to stop CoW intentionally.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ u32 cpos,
+ u32 write_len,
+ u32 max_cpos,
+ u32 *cow_start,
+ u32 *cow_len)
+{
+ int ret = 0;
+ int tree_height = le16_to_cpu(el->l_tree_depth), i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb = NULL;
+ struct ocfs2_extent_rec *rec;
+ unsigned int want_clusters, rec_end = 0;
+ int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+ int leaf_clusters;
+
+ BUG_ON(cpos + write_len > max_cpos);
+
+ if (tree_height > 0) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ *cow_len = 0;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ if (ocfs2_is_empty_extent(rec)) {
+ mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+ "index %d\n", inode->i_ino, i);
+ continue;
+ }
+
+ if (le32_to_cpu(rec->e_cpos) +
+ le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+ continue;
+
+ if (*cow_len == 0) {
+ /*
+ * We should find a refcounted record in the
+ * first pass.
+ */
+ BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+ *cow_start = le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * If we encounter a hole, a non-refcounted record or
+ * pass the max_cpos, stop the search.
+ */
+ if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+ (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
+ (max_cpos <= le32_to_cpu(rec->e_cpos)))
+ break;
+
+ leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+ if (rec_end > max_cpos) {
+ rec_end = max_cpos;
+ leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * How many clusters do we actually need from
+ * this extent? First we see how many we actually
+ * need to complete the write. If that's smaller
+ * than contig_clusters, we try for contig_clusters.
+ */
+ if (!*cow_len)
+ want_clusters = write_len;
+ else
+ want_clusters = (cpos + write_len) -
+ (*cow_start + *cow_len);
+ if (want_clusters < contig_clusters)
+ want_clusters = contig_clusters;
+
+ /*
+ * If the write does not cover the whole extent, we
+ * need to calculate how we're going to split the extent.
+ * We try to do it on contig_clusters boundaries.
+ *
+ * Any extent smaller than contig_clusters will be
+ * CoWed in its entirety.
+ */
+ if (leaf_clusters <= contig_clusters)
+ *cow_len += leaf_clusters;
+ else if (*cow_len || (*cow_start == cpos)) {
+ /*
+ * This extent needs to be CoW'd from its
+ * beginning, so all we have to do is compute
+ * how many clusters to grab. We align
+ * want_clusters to the edge of contig_clusters
+ * to get better I/O.
+ */
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+
+ if (leaf_clusters < want_clusters)
+ *cow_len += leaf_clusters;
+ else
+ *cow_len += want_clusters;
+ } else if ((*cow_start + contig_clusters) >=
+ (cpos + write_len)) {
+ /*
+ * Breaking off contig_clusters at the front
+ * of the extent will cover our write. That's
+ * easy.
+ */
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= contig_clusters) {
+ /*
+ * Breaking off contig_clusters at the tail of
+ * this extent will cover cpos.
+ */
+ *cow_start = rec_end - contig_clusters;
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= want_clusters) {
+ /*
+ * While we can't fit the entire write in this
+ * extent, we know that the write goes from cpos
+ * to the end of the extent. Break that off.
+ * We try to break it at some multiple of
+ * contig_clusters from the front of the extent.
+ * Failing that (ie, cpos is within
+ * contig_clusters of the front), we'll CoW the
+ * entire extent.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+ *cow_len = rec_end - *cow_start;
+ } else {
+ /*
+ * Ok, the entire write lives in the middle of
+ * this extent. Let's try to slice the extent up
+ * nicely. Optimally, our CoW region starts at
+ * m*contig_clusters from the beginning of the
+ * extent and goes for n*contig_clusters,
+ * covering the entire write.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+
+ want_clusters = (cpos + write_len) - *cow_start;
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+ if (*cow_start + want_clusters <= rec_end)
+ *cow_len = want_clusters;
+ else
+ *cow_len = rec_end - *cow_start;
+ }
+
+ /* Have we covered our entire write yet? */
+ if ((*cow_start + *cow_len) >= (cpos + write_len))
+ break;
+
+ /*
+ * If we reach the end of the extent block and don't get enough
+ * clusters, continue with the next extent block if possible.
+ */
+ if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+ eb && eb->h_next_leaf_blk) {
+ brelse(eb_bh);
+ eb_bh = NULL;
+
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ i = -1;
+ }
+ }
+
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ * more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ * just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+ u32 p_cluster, u32 num_clusters,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int *credits)
+{
+ int ret = 0, meta_add = 0;
+ int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (num_free_extents < num_clusters + 2)
+ meta_add =
+ ocfs2_extend_meta_needed(et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
+ num_clusters + 2);
+
+ ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &meta_add, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
+ meta_add, num_clusters, *credits);
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+ data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+ BUG_ON(buffer_dirty(bh));
+
+ clear_buffer_mapped(bh);
+
+ return 0;
+}
+
+static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ int ret = 0, partial;
+ struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+ struct page *page;
+ pgoff_t page_index;
+ unsigned int from, to;
+ loff_t offset, end, map_end;
+ struct address_space *mapping = context->inode->i_mapping;
+
+ mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
+ new_cluster, new_len, cpos);
+
+ offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+
+ while (offset < end) {
+ page_index = offset >> PAGE_CACHE_SHIFT;
+ map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+ if (map_end > end)
+ map_end = end;
+
+ /* from, to is the offset within the page. */
+ from = offset & (PAGE_CACHE_SIZE - 1);
+ to = PAGE_CACHE_SIZE;
+ if (map_end & (PAGE_CACHE_SIZE - 1))
+ to = map_end & (PAGE_CACHE_SIZE - 1);
+
+ page = grab_cache_page(mapping, page_index);
+
+ /* This page can't be dirtied before we CoW it out. */
+ BUG_ON(PageDirty(page));
+
+ if (!PageUptodate(page)) {
+ ret = block_read_full_page(page, ocfs2_get_block);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ lock_page(page);
+ }
+
+ if (page_has_buffers(page)) {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_clear_cow_buffer);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ocfs2_map_and_dirty_page(context->inode,
+ handle, from, to,
+ page, 0, &new_block);
+ mark_page_accessed(page);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ offset = map_end;
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ int ret = 0;
+ struct super_block *sb = context->inode->i_sb;
+ struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+ u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+ u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct buffer_head *old_bh = NULL;
+ struct buffer_head *new_bh = NULL;
+
+ mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
+ new_cluster, new_len);
+
+ for (i = 0; i < blocks; i++, old_block++, new_block++) {
+ new_bh = sb_getblk(osb->sb, new_block);
+ if (new_bh == NULL) {
+ ret = -EIO;
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_journal_access(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+ ret = ocfs2_journal_dirty(handle, new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ brelse(new_bh);
+ brelse(old_bh);
+ new_bh = NULL;
+ old_bh = NULL;
+ }
+
+ brelse(new_bh);
+ brelse(old_bh);
+ return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 p_cluster, u32 len,
+ unsigned int ext_flags,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, index;
+ struct ocfs2_extent_rec replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+ mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
+ (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+ p_cluster));
+ replace_rec.e_flags = ext_flags;
+ replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, et, path, index,
+ &replace_rec, meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 old,
+ u32 new, u32 len,
+ unsigned int ext_flags)
+{
+ int ret;
+ struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ u64 ino = ocfs2_metadata_cache_owner(ci);
+
+ mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
+ (unsigned long long)ino, cpos, old, new, len, ext_flags);
+
+ /*If the old clusters is unwritten, no need to duplicate. */
+ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = context->cow_duplicate_clusters(handle, context, cpos,
+ old, new, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
+ cpos, new, len, ext_flags,
+ context->meta_ac, &context->dealloc);
+ if (ret)
+ mlog_errno(ret);
+out:
+ return ret;
+}
+
+static int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 num_clusters)
+{
+ int ret = 0;
+ loff_t offset, end, map_end;
+ pgoff_t page_index;
+ struct page *page;
+
+ if (ocfs2_should_order_data(context->inode))
+ return 0;
+
+ offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+
+ ret = filemap_fdatawrite_range(context->inode->i_mapping,
+ offset, end - 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ while (offset < end) {
+ page_index = offset >> PAGE_CACHE_SHIFT;
+ map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+ if (map_end > end)
+ map_end = end;
+
+ page = grab_cache_page(context->inode->i_mapping, page_index);
+ BUG_ON(!page);
+
+ wait_on_page_writeback(page);
+ if (PageError(page)) {
+ ret = -EIO;
+ mlog_errno(ret);
+ } else
+ mark_page_accessed(page);
+
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ offset = map_end;
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 p_cluster,
+ u32 num_clusters, unsigned int e_flags)
+{
+ int ret, delete, index, credits = 0;
+ u32 new_bit, new_len;
+ unsigned int set_len;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ handle_t *handle;
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
+ struct ocfs2_refcount_rec rec;
+
+ mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
+ cpos, p_cluster, num_clusters, e_flags);
+
+ ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+ &context->data_et,
+ ref_ci,
+ context->ref_root_bh,
+ &context->meta_ac,
+ &context->data_ac, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (context->post_refcount)
+ credits += context->post_refcount->credits;
+
+ credits += context->extra_credits;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (num_clusters) {
+ ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
+ p_cluster, num_clusters,
+ &rec, &index, &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ BUG_ON(!rec.r_refcount);
+ set_len = min((u64)p_cluster + num_clusters,
+ le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+ /*
+ * There are many different situation here.
+ * 1. If refcount == 1, remove the flag and don't COW.
+ * 2. If refcount > 1, allocate clusters.
+ * Here we may not allocate r_len once at a time, so continue
+ * until we reach num_clusters.
+ */
+ if (le32_to_cpu(rec.r_refcount) == 1) {
+ delete = 0;
+ ret = ocfs2_clear_ext_refcount(handle,
+ &context->data_et,
+ cpos, p_cluster,
+ set_len, e_flags,
+ context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ } else {
+ delete = 1;
+
+ ret = __ocfs2_claim_clusters(osb, handle,
+ context->data_ac,
+ 1, set_len,
+ &new_bit, &new_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_replace_clusters(handle, context,
+ cpos, p_cluster, new_bit,
+ new_len, e_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ set_len = new_len;
+ }
+
+ ret = __ocfs2_decrease_refcount(handle, ref_ci,
+ context->ref_root_bh,
+ p_cluster, set_len,
+ context->meta_ac,
+ &context->dealloc, delete);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ cpos += set_len;
+ p_cluster += set_len;
+ num_clusters -= set_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+ /* handle any post_cow action. */
+ if (context->post_refcount && context->post_refcount->func) {
+ ret = context->post_refcount->func(context->inode, handle,
+ context->post_refcount->para);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ if (context->get_clusters == ocfs2_di_get_clusters) {
+ ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (context->data_ac) {
+ ocfs2_free_alloc_context(context->data_ac);
+ context->data_ac = NULL;
+ }
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+ brelse(ref_leaf_bh);
+
+ return ret;
+}
+
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
+{
+ int ret = 0;
+ struct inode *inode = context->inode;
+ u32 cow_start = context->cow_start, cow_len = context->cow_len;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ return -EROFS;
+ }
+
+ ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+ while (cow_len) {
+ ret = context->get_clusters(context, cow_start, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+ if (cow_len < num_clusters)
+ num_clusters = cow_len;
+
+ ret = ocfs2_make_clusters_writable(inode->i_sb, context,
+ cow_start, p_cluster,
+ num_clusters, ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ cow_len -= num_clusters;
+ cow_start += num_clusters;
+ }
+
+ if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &context->dealloc);
+ }
+
+ return ret;
+}
+
+/*
+ * Starting at cpos, try to CoW write_len clusters. Don't CoW
+ * past max_cpos. This will stop when it runs into a hole or an
+ * unrefcounted extent.
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos)
+{
+ int ret;
+ u32 cow_start = 0, cow_len = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_cow_context *context = NULL;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+ cpos, write_len, max_cpos,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
+ "cow_len %u\n", inode->i_ino,
+ cpos, write_len, cow_start, cow_len);
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+ context->get_clusters = ocfs2_di_get_clusters;
+
+ ocfs2_init_dinode_extent_tree(&context->data_et,
+ INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * truncate the extent map here since no matter whether we meet with
+ * any error during the action, we shouldn't trust cached extent map
+ * any more.
+ */
+ ocfs2_extent_map_trunc(inode, cow_start);
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ kfree(context);
+ return ret;
+}
+
+/*
+ * CoW any and all clusters between cpos and cpos+write_len.
+ * Don't CoW past max_cpos. If this returns successfully, all
+ * clusters between cpos and cpos+write_len are safe to modify.
+ */
+int ocfs2_refcount_cow(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos)
+{
+ int ret = 0;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+
+ while (write_len) {
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ if (write_len < num_clusters)
+ num_clusters = write_len;
+
+ if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+ ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+ num_clusters, max_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ write_len -= num_clusters;
+ cpos += num_clusters;
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ struct inode *inode = context->inode;
+ struct ocfs2_xattr_value_root *xv = context->cow_object;
+
+ return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+ num_clusters, &xv->xr_list,
+ extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int *meta_add, int *credits)
+{
+ int ret = 0, index, ref_blocks = 0;
+ u32 p_cluster, num_clusters;
+ u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec rec;
+ struct buffer_head *ref_leaf_bh = NULL;
+
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &xv->xr_list,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos += num_clusters;
+
+ while (num_clusters) {
+ ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!rec.r_refcount);
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ /*
+ * We really don't know whether the other clusters is in
+ * this refcount block or not, so just take the worst
+ * case that all the clusters are in this block and each
+ * one will split a refcount rec, so totally we need
+ * clusters * 2 new refcount rec.
+ */
+ if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+
+ *credits += 1;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+
+ if (num_clusters <= le32_to_cpu(rec.r_clusters))
+ break;
+ else
+ num_clusters -= le32_to_cpu(rec.r_clusters);
+ p_cluster += num_clusters;
+ }
+ }
+
+ *meta_add += ref_blocks;
+ if (!ref_blocks)
+ goto out;
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ else {
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+ *credits += ocfs2_calc_extend_credits(inode->i_sb,
+ et.et_root_el,
+ ref_blocks);
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_cow_context *context = NULL;
+ u32 cow_start, cow_len;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+ cpos, write_len, UINT_MAX,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;;
+ context->cow_object = xv;
+
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+ /* We need the extra credits for duplicate_clusters by jbd. */
+ context->extra_credits =
+ ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+ context->get_clusters = ocfs2_xattr_value_get_clusters;
+ context->post_refcount = post;
+
+ ocfs2_init_xattr_value_extent_tree(&context->data_et,
+ INODE_CACHE(inode), vb);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ kfree(context);
+ return ret;
+}
+
+/*
+ * Insert a new extent into refcount tree and mark a extent rec
+ * as refcounted in the dinode tree.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 1, ref_blocks = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+ ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &ref_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "reserve new metadata %d, credits = %d\n",
+ ref_blocks, credits);
+
+ if (ref_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ref_blocks, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (post)
+ credits += post->credits;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+ cpos, num_clusters, p_cluster,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters, 0,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (post && post->func) {
+ ret = post->func(inode, handle, post->para);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_change_ctime(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret, data_changed = 0;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_tree *ref_tree;
+ unsigned int ext_flags;
+ loff_t size;
+ u32 cpos, num_clusters, clusters, p_cluster;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree di_et;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+ ret = ocfs2_create_refcount_tree(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ BUG_ON(!di->i_refcount_loc);
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(di->i_refcount_loc), 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto attach_xattr;
+
+ ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+ size = i_size_read(inode);
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+
+ if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(inode, &di_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, cpos,
+ p_cluster, num_clusters,
+ &dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ data_changed = 1;
+ }
+ cpos += num_clusters;
+ }
+
+attach_xattr:
+ if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ if (data_changed) {
+ ret = ocfs2_change_ctime(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+unlock:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+out:
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode, 0);
+
+ return ret;
+}
+
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ unsigned int ext_flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+ p_cluster, num_clusters,
+ et, ref_ci,
+ ref_root_bh, &meta_ac,
+ NULL, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_insert_extent(handle, et, cpos,
+ cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+ p_cluster)),
+ num_clusters, ext_flags, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+ BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+ memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+ le16_to_cpu(s_di->id2.i_data.id_count));
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+ t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ u32 p_cluster, num_clusters, clusters, cpos;
+ loff_t size;
+ unsigned int ext_flags;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+ size = i_size_read(s_inode);
+ clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+
+ if (p_cluster) {
+ ret = ocfs2_add_refcounted_extent(t_inode, &et,
+ ref_ci, ref_root_bh,
+ cpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += num_clusters;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * change the new file's attributes to the src.
+ *
+ * reflink creates a snapshot of a file, that means the attributes
+ * must be identical except for three exceptions - nlink, ino, and ctime.
+ */
+static int ocfs2_complete_reflink(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ bool preserve)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
+ loff_t size = i_size_read(s_inode);
+
+ handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+ OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
+ OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+ i_size_write(t_inode, size);
+
+ di->i_xattr_inline_size = s_di->i_xattr_inline_size;
+ di->i_clusters = s_di->i_clusters;
+ di->i_size = s_di->i_size;
+ di->i_dyn_features = s_di->i_dyn_features;
+ di->i_attr = s_di->i_attr;
+
+ if (preserve) {
+ di->i_uid = s_di->i_uid;
+ di->i_gid = s_di->i_gid;
+ di->i_mode = s_di->i_mode;
+
+ /*
+ * update time.
+ * we want mtime to appear identical to the source and
+ * update ctime.
+ */
+ t_inode->i_ctime = CURRENT_TIME;
+
+ di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+ t_inode->i_mtime = s_inode->i_mtime;
+ di->i_mtime = s_di->i_mtime;
+ di->i_mtime_nsec = s_di->i_mtime_nsec;
+ }
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+ return ret;
+}
+
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ bool preserve)
+{
+ int ret;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_refcount_tree *ref_tree;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(di->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+ t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+ &ref_tree->rf_ci, ref_root_bh,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+static int __ocfs2_reflink(struct dentry *old_dentry,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ bool preserve)
+{
+ int ret;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *new_bh = NULL;
+
+ ret = filemap_fdatawrite(inode->i_mapping);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_attach_refcount_tree(inode, old_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&new_inode->i_mutex);
+ ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_create_reflink_node(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_reflink_xattrs(inode, old_bh,
+ new_inode, new_bh,
+ preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+ }
+
+ ret = ocfs2_complete_reflink(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret)
+ mlog_errno(ret);
+
+inode_unlock:
+ ocfs2_inode_unlock(new_inode, 1);
+ brelse(new_bh);
+out_unlock:
+ mutex_unlock(&new_inode->i_mutex);
+out:
+ if (!ret) {
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret)
+ mlog_errno(ret);
+ }
+ return ret;
+}
+
+static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ int error;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *old_bh = NULL;
+ struct inode *new_orphan_inode = NULL;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ &new_orphan_inode);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_inode_lock(inode, &old_bh, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ error = __ocfs2_reflink(old_dentry, old_bh,
+ new_orphan_inode, preserve);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ocfs2_inode_unlock(inode, 1);
+ brelse(old_bh);
+
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ /* If the security isn't preserved, we need to re-initialize them. */
+ if (!preserve) {
+ error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
+ if (error)
+ mlog_errno(error);
+ }
+out:
+ if (!error) {
+ error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ new_dentry);
+ if (error)
+ mlog_errno(error);
+ }
+
+ if (new_orphan_inode) {
+ /*
+ * We need to open_unlock the inode no matter whether we
+ * succeed or not, so that other nodes can delete it later.
+ */
+ ocfs2_open_unlock(new_orphan_inode);
+ if (error)
+ iput(new_orphan_inode);
+ }
+
+ return error;
+}
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink(). This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/* copied from may_create in VFS. */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+ if (child->d_inode)
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/* copied from user_path_parent. */
+static int ocfs2_user_path_parent(const char __user *path,
+ struct nameidata *nd, char **name)
+{
+ char *s = getname(path);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ error = path_lookup(s, LOOKUP_PARENT, nd);
+ if (error)
+ putname(s);
+ else
+ *name = s;
+
+ return error;
+}
+
+/**
+ * ocfs2_vfs_reflink - Create a reference-counted link
+ *
+ * @old_dentry: source dentry + inode
+ * @dir: directory to create the target
+ * @new_dentry: target dentry
+ * @preserve: if true, preserve all file attributes
+ */
+int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ struct inode *inode = old_dentry->d_inode;
+ int error;
+
+ if (!inode)
+ return -ENOENT;
+
+ error = ocfs2_may_create(dir, new_dentry);
+ if (error)
+ return error;
+
+ if (dir->i_sb != inode->i_sb)
+ return -EXDEV;
+
+ /*
+ * A reflink to an append-only or immutable file cannot be created.
+ */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /* Only regular files can be reflinked. */
+ if (!S_ISREG(inode->i_mode))
+ return -EPERM;
+
+ /*
+ * If the caller wants to preserve ownership, they require the
+ * rights to do so.
+ */
+ if (preserve) {
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ }
+
+ /*
+ * If the caller is modifying any aspect of the attributes, they
+ * are not creating a snapshot. They need read permission on the
+ * file.
+ */
+ if (!preserve) {
+ error = inode_permission(inode, MAY_READ);
+ if (error)
+ return error;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ vfs_dq_init(dir);
+ error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+ mutex_unlock(&inode->i_mutex);
+ if (!error)
+ fsnotify_create(dir, new_dentry);
+ return error;
+}
+/*
+ * Most codes are copied from sys_linkat.
+ */
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve)
+{
+ struct dentry *new_dentry;
+ struct nameidata nd;
+ struct path old_path;
+ int error;
+ char *to = NULL;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+ if (error) {
+ mlog_errno(error);
+ return error;
+ }
+
+ error = ocfs2_user_path_parent(newname, &nd, &to);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = -EXDEV;
+ if (old_path.mnt != nd.path.mnt)
+ goto out_release;
+ new_dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry)) {
+ mlog_errno(error);
+ goto out_unlock;
+ }
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error) {
+ mlog_errno(error);
+ goto out_dput;
+ }
+
+ error = ocfs2_vfs_reflink(old_path.dentry,
+ nd.path.dentry->d_inode,
+ new_dentry, preserve);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(new_dentry);
+out_unlock:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out_release:
+ path_put(&nd.path);
+ putname(to);
+out:
+ path_put(&old_path);
+
+ return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 000000000000..c1d19b1d3ecc
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,106 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+ struct rb_node rf_node;
+ u64 rf_blkno;
+ u32 rf_generation;
+ struct rw_semaphore rf_sem;
+ struct ocfs2_lock_res rf_lockres;
+ struct kref rf_getcnt;
+ int rf_removed;
+
+ /* the following 4 fields are used by caching_info. */
+ struct ocfs2_caching_info rf_ci;
+ spinlock_t rf_lock;
+ struct mutex rf_io_mutex;
+ struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **tree,
+ struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree,
+ int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ struct ocfs2_alloc_context **meta_ac);
+int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+ handle_t *handle,
+ void *para);
+/*
+ * Some refcount caller need to do more work after we modify the data b-tree
+ * during refcount operation(including CoW and add refcount flag), and make the
+ * transaction complete. So it must give us this structure so that we can do it
+ * within our transaction.
+ *
+ */
+struct ocfs2_post_refcount {
+ int credits; /* credits it need for journal. */
+ ocfs2_post_refcount_func *func; /* real function. */
+ void *para;
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post);
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh);
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve);
+#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 424adaa5f900..3c3d673a4d20 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
new_clusters, first_new_cluster);
- ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
}
/* update the inode accordingly. */
- ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_unlock;
}
- ocfs2_set_new_buffer_uptodate(inode, group_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
@@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
- ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
@@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_commit;
}
- ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+ main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 40661e7824e9..bfbd7e9e949f 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
- ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
- OCFS2_BH_IGNORE_CACHE, NULL);
+ ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+ si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
- status = ocfs2_write_block(osb, bh, si->si_inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
mlog_errno(status);
@@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
(unsigned long long)blkno);
bh = NULL; /* Acquire a fresh bh */
- status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE, NULL);
+ status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+ 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 73a16d4666dc..c30b644d9572 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+ rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
ocfs2_validate_group_descriptor);
if (rc)
goto out;
@@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
}
status = ocfs2_journal_access_gd(handle,
- alloc_inode,
+ INODE_CACHE(alloc_inode),
bg_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
@@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
status = ocfs2_block_group_fill(handle,
alloc_inode,
@@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- status = ocfs2_journal_access_di(handle, alloc_inode,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
status = ocfs2_journal_access_gd(handle,
- alloc_inode,
+ INODE_CACHE(alloc_inode),
group_bh,
journal_type);
if (status < 0) {
@@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
- status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ prev_bg_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* Ok, claim our bits now: set the info on dinode, chainlist
* and then the group */
status = ocfs2_journal_access_di(handle,
- alloc_inode,
+ INODE_CACHE(alloc_inode),
ac->ac_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
@@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ group_bh, journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode,
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
- num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+ num_free_extents = ocfs2_num_free_extents(osb, et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..26069917a9f5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/statfs.h>
@@ -69,6 +68,7 @@
#include "ver.h"
#include "xattr.h"
#include "quota.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -100,6 +100,8 @@ struct mount_options
static int ocfs2_parse_options(struct super_block *sb, char *options,
struct mount_options *mopt,
int is_remount);
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options);
static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
@@ -373,7 +375,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations ocfs2_osb_debug_fops = {
+static const struct file_operations ocfs2_osb_debug_fops = {
.open = ocfs2_osb_debug_open,
.release = ocfs2_debug_release,
.read = ocfs2_debug_read,
@@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
lock_kernel();
- if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+ if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
+ !ocfs2_check_set_options(sb, &parsed_options)) {
ret = -EINVAL;
goto out;
}
@@ -691,8 +694,6 @@ unlock_osb:
if (!ret) {
/* Only save off the new mount options in case of a successful
* remount. */
- if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
- parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
@@ -701,6 +702,10 @@ unlock_osb:
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+ MS_POSIXACL : 0);
}
out:
unlock_kernel();
@@ -773,18 +778,20 @@ static int ocfs2_sb_probe(struct super_block *sb,
if (tmpstat < 0) {
status = tmpstat;
mlog_errno(status);
- goto bail;
+ break;
}
di = (struct ocfs2_dinode *) (*bh)->b_data;
memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
spin_lock_init(&stats->b_lock);
- status = ocfs2_verify_volume(di, *bh, blksize, stats);
- if (status >= 0)
- goto bail;
- brelse(*bh);
- *bh = NULL;
- if (status != -EAGAIN)
+ tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+ if (tmpstat < 0) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ if (tmpstat != -EAGAIN) {
+ status = tmpstat;
break;
+ }
}
bail:
@@ -965,7 +972,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
-static struct quotactl_ops ocfs2_quotactl_ops = {
+static const struct quotactl_ops ocfs2_quotactl_ops = {
.quota_on = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1009,31 +1016,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
bh = NULL;
- if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
- parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
-
+ if (!ocfs2_check_set_options(sb, &parsed_options)) {
+ status = -EINVAL;
+ goto read_super_error;
+ }
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
osb->local_alloc_bits = osb->local_alloc_default_bits;
- if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
- !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- status = -EINVAL;
- mlog(ML_ERROR, "User quotas were requested, but this "
- "filesystem does not have the feature enabled.\n");
- goto read_super_error;
- }
- if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
- !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- status = -EINVAL;
- mlog(ML_ERROR, "Group quotas were requested, but this "
- "filesystem does not have the feature enabled.\n");
- goto read_super_error;
- }
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -1243,6 +1235,40 @@ static struct file_system_type ocfs2_fs_type = {
.next = NULL
};
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options)
+{
+ if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ mlog(ML_ERROR, "User quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ mlog(ML_ERROR, "Group quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+ !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
+ mlog(ML_ERROR, "ACL support requested but extended attributes "
+ "feature is not enabled\n");
+ return 0;
+ }
+ /* No ACL setting specified? Use XATTR feature... */
+ if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
+ OCFS2_MOUNT_NO_POSIX_ACL))) {
+ if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
+ options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ else
+ options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+ }
+ return 1;
+}
+
static int ocfs2_parse_options(struct super_block *sb,
char *options,
struct mount_options *mopt,
@@ -1390,40 +1416,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
break;
case Opt_usrquota:
- /* We check only on remount, otherwise features
- * aren't yet initialized. */
- if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- mlog(ML_ERROR, "User quota requested but "
- "filesystem feature is not set\n");
- status = 0;
- goto bail;
- }
mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
break;
case Opt_grpquota:
- if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- mlog(ML_ERROR, "Group quota requested but "
- "filesystem feature is not set\n");
- status = 0;
- goto bail;
- }
mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
break;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
case Opt_acl:
mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
break;
case Opt_noacl:
+ mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
break;
-#else
- case Opt_acl:
- case Opt_noacl:
- printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
- break;
-#endif
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1500,12 +1505,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_INODE64)
seq_printf(s, ",inode64");
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
if (opts & OCFS2_MOUNT_POSIX_ACL)
seq_printf(s, ",acl");
else
seq_printf(s, ",noacl");
-#endif
return 0;
}
@@ -1645,6 +1648,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = numbits;
buf->f_ffree = freebits;
+ buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+ & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+ OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
brelse(bh);
@@ -1668,8 +1675,6 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
init_rwsem(&oi->ip_alloc_sem);
@@ -1683,7 +1688,8 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- ocfs2_metadata_cache_init(&oi->vfs_inode);
+ ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
+ &ocfs2_inode_caching_ops);
inode_init_once(&oi->vfs_inode);
}
@@ -1859,6 +1865,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_sync_blockdev(sb);
+ ocfs2_purge_refcount_trees(osb);
+
/* No cluster connection means we've failed during mount, so skip
* all the steps which depended on that to complete. */
if (osb->cconn) {
@@ -2065,6 +2073,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ osb->osb_rf_lock_tree = RB_ROOT;
+
osb->s_feature_compat =
le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
osb->s_feature_ro_compat =
@@ -2490,7 +2500,8 @@ void __ocfs2_abort(struct super_block* sb,
/* Force a panic(). This stinks, but it's better than letting
* things continue without having a proper hard readonly
* here. */
- OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+ if (!ocfs2_mount_local(OCFS2_SB(sb)))
+ OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
ocfs2_handle_error(sb);
}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/utsname.h>
#include <linux/namei.h>
#define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 187b99ff0368..c61369342a27 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,11 +53,6 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-#endif
#define MLOG_MASK_PREFIX ML_UPTODATE
@@ -75,15 +70,77 @@ struct ocfs2_meta_cache_item {
static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
-void ocfs2_metadata_cache_init(struct inode *inode)
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+ BUG_ON(!ci || !ci->ci_ops);
- oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+ return ci->ci_ops->co_owner(ci);
+}
+
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ return ci->ci_ops->co_get_super(ci);
+}
+
+static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_cache_lock(ci);
+}
+
+static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_cache_unlock(ci);
+}
+
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_io_lock(ci);
+}
+
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_io_unlock(ci);
+}
+
+
+static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
+ int clear)
+{
+ ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
ci->ci_num_cached = 0;
+
+ if (clear) {
+ ci->ci_created_trans = 0;
+ ci->ci_last_trans = 0;
+ }
+}
+
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+ const struct ocfs2_caching_operations *ops)
+{
+ BUG_ON(!ops);
+
+ ci->ci_ops = ops;
+ ocfs2_metadata_cache_reset(ci, 1);
}
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
+{
+ ocfs2_metadata_cache_purge(ci);
+ ocfs2_metadata_cache_reset(ci, 1);
+}
+
+
/* No lock taken here as 'root' is not expected to be visible to other
* processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
@@ -112,19 +169,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
* This function is a few more lines longer than necessary due to some
* accounting done here, but I think it's worth tracking down those
* bugs sooner -- Mark */
-void ocfs2_metadata_cache_purge(struct inode *inode)
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
unsigned int tree, to_purge, purged;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct rb_root root = RB_ROOT;
- spin_lock(&oi->ip_lock);
- tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ocfs2_metadata_cache_lock(ci);
+ tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
to_purge = ci->ci_num_cached;
- mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
- tree ? "array" : "tree", (unsigned long long)oi->ip_blkno);
+ mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
+ tree ? "array" : "tree",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* If we're a tree, save off the root so that we can safely
* initialize the cache. We do the work to free tree members
@@ -132,16 +190,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
if (tree)
root = ci->ci_cache.ci_tree;
- ocfs2_metadata_cache_init(inode);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_reset(ci, 0);
+ ocfs2_metadata_cache_unlock(ci);
purged = ocfs2_purge_copied_metadata_tree(&root);
/* If possible, track the number wiped so that we can more
* easily detect counting errors. Unfortunately, this is only
* meaningful for trees. */
if (tree && purged != to_purge)
- mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
- (unsigned long long)oi->ip_blkno, to_purge, purged);
+ mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, purged);
}
/* Returns the index in the cache array, -1 if not found.
@@ -182,27 +241,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
return NULL;
}
-static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int index = -1;
struct ocfs2_meta_cache_item *item = NULL;
- spin_lock(&oi->ip_lock);
+ ocfs2_metadata_cache_lock(ci);
- mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
- (unsigned long long)oi->ip_blkno,
+ mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long) bh->b_blocknr,
- !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+ !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
- index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
+ index = ocfs2_search_cache_array(ci, bh->b_blocknr);
else
- item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
mlog(0, "index = %d, item = %p\n", index, item);
@@ -214,7 +271,7 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
*
* This can be called under lock_buffer()
*/
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
/* Doesn't matter if the bh is in our cache or not -- if it's
@@ -230,24 +287,24 @@ int ocfs2_buffer_uptodate(struct inode *inode,
/* Ok, locally the buffer is marked as up to date, now search
* our cache to see if we can trust that. */
- return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return ocfs2_buffer_cached(ci, bh);
}
-/*
+/*
* Determine whether a buffer is currently out on a read-ahead request.
- * ip_io_sem should be held to serialize submitters with the logic here.
+ * ci_io_sem should be held to serialize submitters with the logic here.
*/
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
}
/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
sector_t block)
{
- BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
ci->ci_num_cached);
@@ -292,66 +349,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached++;
}
-static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
- struct ocfs2_caching_info *ci)
+/* co_cache_lock() must be held */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
{
- assert_spin_locked(&oi->ip_lock);
-
- return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
- (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+ return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
+ (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
}
-/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
* pointers in tree after we use them - this allows caller to detect
- * when to free in case of error. */
-static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+ * when to free in case of error.
+ *
+ * The co_cache_lock() must be held. */
+static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item **tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
- "Inode %llu, num cached = %u, should be %u\n",
- (unsigned long long)oi->ip_blkno, ci->ci_num_cached,
- OCFS2_INODE_MAX_CACHE_ARRAY);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
- "Inode %llu not marked as inline anymore!\n",
- (unsigned long long)oi->ip_blkno);
- assert_spin_locked(&oi->ip_lock);
+ mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
+ "Owner %llu, num cached = %u, should be %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
+ mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
+ "Owner %llu not marked as inline anymore!\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* Be careful to initialize the tree members *first* because
* once the ci_tree is used, the array is junk... */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
tree[i]->c_block = ci->ci_cache.ci_array[i];
- oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+ ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
ci->ci_cache.ci_tree = RB_ROOT;
/* this will be set again by __ocfs2_insert_cache_tree */
ci->ci_num_cached = 0;
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
__ocfs2_insert_cache_tree(ci, tree[i]);
tree[i] = NULL;
}
mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
- (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_flags, ci->ci_num_cached);
}
/* Slow path function - memory allocation is necessary. See the
* comment above ocfs2_set_buffer_uptodate for more information. */
-static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
sector_t block,
int expand_tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct ocfs2_meta_cache_item *new = NULL;
- struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+ struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
{ NULL, };
- mlog(0, "Inode %llu, block %llu, expand = %d\n",
- (unsigned long long)oi->ip_blkno,
+ mlog(0, "Owner %llu, block %llu, expand = %d\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)block, expand_tree);
new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
@@ -364,7 +419,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
if (expand_tree) {
/* Do *not* allocate an array here - the removal code
* has no way of tracking that. */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
GFP_NOFS);
if (!tree[i]) {
@@ -376,21 +431,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
}
}
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
mlog(0, "Someone cleared the tree underneath us\n");
/* Ok, items were removed from the cache in between
* locks. Detect this and revert back to the fast path */
ocfs2_append_cache_array(ci, block);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
goto out_free;
}
if (expand_tree)
- ocfs2_expand_cache(oi, tree);
+ ocfs2_expand_cache(ci, tree);
__ocfs2_insert_cache_tree(ci, new);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
new = NULL;
out_free:
@@ -400,14 +455,14 @@ out_free:
/* If these were used, then ocfs2_expand_cache re-set them to
* NULL for us. */
if (tree[0]) {
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
if (tree[i])
kmem_cache_free(ocfs2_uptodate_cachep,
tree[i]);
}
}
-/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
+/* Item insertion is guarded by co_io_lock(), so the insertion path takes
* advantage of this by not rechecking for a duplicate insert during
* the slow case. Additionally, if the cache needs to be bumped up to
* a tree, the code will not recheck after acquiring the lock --
@@ -425,59 +480,55 @@ out_free:
* Readahead buffers can be passed in here before the I/O request is
* completed.
*/
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int expand;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
/* The block may very well exist in our cache already, so avoid
* doing any more work in that case. */
- if (ocfs2_buffer_cached(oi, bh))
+ if (ocfs2_buffer_cached(ci, bh))
return;
- mlog(0, "Inode %llu, inserting block %llu\n",
- (unsigned long long)oi->ip_blkno,
+ mlog(0, "Owner %llu, inserting block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr);
/* No need to recheck under spinlock - insertion is guarded by
- * ip_io_mutex */
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
+ * co_io_lock() */
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
/* Fast case - it's an array and there's a free
* spot. */
ocfs2_append_cache_array(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
return;
}
expand = 0;
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
/* We need to bump things up to a tree. */
expand = 1;
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
- __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+ __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
}
/* Called against a newly allocated buffer. Most likely nobody should
* be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_mutex anyway. */
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ * allocated, but this is careful to take co_io_lock() anyway. */
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
/* This should definitely *not* exist in our cache */
- BUG_ON(ocfs2_buffer_cached(oi, bh));
+ BUG_ON(ocfs2_buffer_cached(ci, bh));
set_buffer_uptodate(bh);
- mutex_lock(&oi->ip_io_mutex);
- ocfs2_set_buffer_uptodate(inode, bh);
- mutex_unlock(&oi->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
+ ocfs2_set_buffer_uptodate(ci, bh);
+ ocfs2_metadata_cache_io_unlock(ci);
}
/* Requires ip_lock. */
@@ -487,7 +538,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
sector_t *array = ci->ci_cache.ci_array;
int bytes;
- BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
BUG_ON(index >= ci->ci_num_cached);
BUG_ON(!ci->ci_num_cached);
@@ -515,21 +566,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached--;
}
-static void ocfs2_remove_block_from_cache(struct inode *inode,
+static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
sector_t block)
{
int index;
struct ocfs2_meta_cache_item *item = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- spin_lock(&oi->ip_lock);
- mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
- (unsigned long long)oi->ip_blkno,
+ ocfs2_metadata_cache_lock(ci);
+ mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long) block, ci->ci_num_cached,
- oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ ci->ci_flags & OCFS2_CACHE_FL_INLINE);
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
index = ocfs2_search_cache_array(ci, block);
if (index != -1)
ocfs2_remove_metadata_array(ci, index);
@@ -538,7 +587,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
if (item)
ocfs2_remove_metadata_tree(ci, item);
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
if (item)
kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -549,23 +598,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
* bother reverting things to an inlined array in the case of a remove
* which moves us back under the limit.
*/
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
sector_t block = bh->b_blocknr;
- ocfs2_remove_block_from_cache(inode, block);
+ ocfs2_remove_block_from_cache(ci, block);
}
/* Called when we remove xattr clusters from an inode. */
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
sector_t block,
u32 c_len)
{
- unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
for (i = 0; i < b_len; i++, block++)
- ocfs2_remove_block_from_cache(inode, block);
+ ocfs2_remove_block_from_cache(ci, block);
}
int __init init_ocfs2_uptodate_cache(void)
@@ -577,7 +627,7 @@ int __init init_ocfs2_uptodate_cache(void)
return -ENOMEM;
mlog(0, "%u inlined cache items per inode.\n",
- OCFS2_INODE_MAX_CACHE_ARRAY);
+ OCFS2_CACHE_INFO_MAX_ARRAY);
return 0;
}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 531b4b3a0c47..0d826fe2da0d 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,24 +26,59 @@
#ifndef OCFS2_UPTODATE_H
#define OCFS2_UPTODATE_H
+/*
+ * The caching code relies on locking provided by the user of
+ * struct ocfs2_caching_info. These operations connect that up.
+ */
+struct ocfs2_caching_operations {
+ /*
+ * A u64 representing the owning structure. Usually this
+ * is the block number (i_blkno or whatnot). This is used so
+ * that caching log messages can identify the owning structure.
+ */
+ u64 (*co_owner)(struct ocfs2_caching_info *ci);
+
+ /* The superblock is needed during I/O. */
+ struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
+ /*
+ * Lock and unlock the caching data. These will not sleep, and
+ * should probably be spinlocks.
+ */
+ void (*co_cache_lock)(struct ocfs2_caching_info *ci);
+ void (*co_cache_unlock)(struct ocfs2_caching_info *ci);
+
+ /*
+ * Lock and unlock for disk I/O. These will sleep, and should
+ * be mutexes.
+ */
+ void (*co_io_lock)(struct ocfs2_caching_info *ci);
+ void (*co_io_unlock)(struct ocfs2_caching_info *ci);
+};
+
int __init init_ocfs2_uptodate_cache(void);
void exit_ocfs2_uptodate_cache(void);
-void ocfs2_metadata_cache_init(struct inode *inode);
-void ocfs2_metadata_cache_purge(struct inode *inode);
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+ const struct ocfs2_caching_operations *ops);
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
sector_t block,
u32 c_len);
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1a27cda984f..923e950e5445 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,8 @@
#include "buffer_head_io.h"
#include "super.h"
#include "xattr.h"
-
+#include "refcounttree.h"
+#include "acl.h"
struct ocfs2_xattr_def_value_root {
struct ocfs2_xattr_value_root xv;
@@ -97,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
&ocfs2_xattr_acl_access_handler,
&ocfs2_xattr_acl_default_handler,
-#endif
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
@@ -108,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
= &ocfs2_xattr_acl_access_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
= &ocfs2_xattr_acl_default_handler,
-#endif
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
[OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
@@ -140,7 +137,7 @@ struct ocfs2_xattr_search {
int not_found;
};
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
struct ocfs2_xattr_header *xh,
int index,
int *block_off,
@@ -157,7 +154,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
struct ocfs2_xattr_search *xs);
static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
- struct ocfs2_xattr_tree_root *xt,
+ struct buffer_head *blk_bh,
char *buffer,
size_t buffer_size);
@@ -170,12 +167,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_search *xs,
struct ocfs2_xattr_set_ctxt *ctxt);
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
- struct buffer_head *xb_bh);
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *root_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para);
+
static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
u64 src_blk, u64 last_blk, u64 to_blk,
unsigned int start_bucket,
u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_need,
+ int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh);
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
{
@@ -254,9 +281,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
break;
}
- if (!ocfs2_buffer_uptodate(bucket->bu_inode,
+ if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
bucket->bu_bhs[i]))
- ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
bucket->bu_bhs[i]);
}
@@ -271,7 +298,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
{
int rc;
- rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+ rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
bucket->bu_blocks, bucket->bu_bhs, 0,
NULL);
if (!rc) {
@@ -297,7 +324,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
int i, rc = 0;
for (i = 0; i < bucket->bu_blocks; i++) {
- rc = ocfs2_journal_access(handle, bucket->bu_inode,
+ rc = ocfs2_journal_access(handle,
+ INODE_CACHE(bucket->bu_inode),
bucket->bu_bhs[i], type);
if (rc) {
mlog_errno(rc);
@@ -399,7 +427,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+ rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
ocfs2_validate_xattr_block);
/* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -596,15 +624,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
int status = 0;
handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
struct ocfs2_extent_tree et;
mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
- ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- status = vb->vb_access(handle, inode, vb->vb_bh,
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -612,13 +639,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
}
prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
- status = ocfs2_add_clusters_in_btree(osb,
- inode,
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
&logical_start,
clusters_to_add,
0,
- &et,
- handle,
ctxt->data_ac,
ctxt->meta_ac,
&why);
@@ -649,6 +674,7 @@ leave:
static int __ocfs2_remove_xattr_range(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
u32 cpos, u32 phys_cpos, u32 len,
+ unsigned int ext_flags,
struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
@@ -656,16 +682,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
handle_t *handle = ctxt->handle;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- ret = vb->vb_access(handle, inode, vb->vb_bh,
+ ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
&ctxt->dealloc);
if (ret) {
mlog_errno(ret);
@@ -680,7 +706,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
goto out;
}
- ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(inode->i_sb,
+ phys_blkno),
+ len, ctxt->meta_ac, &ctxt->dealloc, 1);
+ else
+ ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+ phys_blkno, len);
if (ret)
mlog_errno(ret);
@@ -695,6 +728,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret = 0;
+ unsigned int ext_flags;
u32 trunc_len, cpos, phys_cpos, alloc_size;
u64 block;
@@ -706,7 +740,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
while (trunc_len) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
&alloc_size,
- &vb->vb_xv->xr_list);
+ &vb->vb_xv->xr_list, &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
@@ -717,15 +751,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
phys_cpos, alloc_size,
- ctxt);
+ ext_flags, ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- ocfs2_remove_xattr_clusters_from_cache(inode, block,
- alloc_size);
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
+ block, alloc_size);
cpos += alloc_size;
trunc_len -= alloc_size;
}
@@ -810,6 +844,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
return result;
}
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_xattr_header *xh;
+ int i;
+
+ xh = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+ if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+ return 1;
+
+ return 0;
+}
+
static int ocfs2_xattr_ibody_list(struct inode *inode,
struct ocfs2_dinode *di,
char *buffer,
@@ -855,11 +906,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
ret = ocfs2_xattr_list_entries(inode, header,
buffer, buffer_size);
- } else {
- struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
- ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+ } else
+ ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
buffer, buffer_size);
- }
brelse(blk_bh);
@@ -961,7 +1010,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
cpos = 0;
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
- &num_clusters, el);
+ &num_clusters, el, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -970,7 +1019,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
/* Copy ocfs2_xattr_value */
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh, NULL);
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1085,7 +1135,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
i = xs->here - xs->header->xh_entries;
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(xs->bucket),
i,
&block_off,
@@ -1183,7 +1233,7 @@ static int ocfs2_xattr_get(struct inode *inode,
static int __ocfs2_xattr_set_value_outside(struct inode *inode,
handle_t *handle,
- struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_xattr_value_buf *vb,
const void *value,
int value_len)
{
@@ -1194,28 +1244,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
u64 blkno;
struct buffer_head *bh = NULL;
+ unsigned int ext_flags;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
- &num_clusters, &xv->xr_list);
+ &num_clusters, &xv->xr_list,
+ &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh, NULL);
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access(handle,
- inode,
+ INODE_CACHE(inode),
bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
@@ -1266,7 +1322,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
void *val = xs->base + offs;
size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- ret = vb->vb_access(handle, inode, vb->vb_bh,
+ ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1294,7 +1350,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
{
int ret;
- ret = vb->vb_access(handle, inode, vb->vb_bh,
+ ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1355,7 +1411,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
mlog_errno(ret);
return ret;
}
- ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+ ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
xi->value, xi->value_len);
if (ret < 0)
mlog_errno(ret);
@@ -1594,7 +1650,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
ret = __ocfs2_xattr_set_value_outside(inode,
handle,
- vb.vb_xv,
+ &vb,
xi->value,
xi->value_len);
if (ret < 0)
@@ -1615,7 +1671,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
}
}
- ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1623,7 +1679,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
}
if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- ret = vb.vb_access(handle, inode, vb.vb_bh,
+ ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1700,51 +1756,112 @@ out:
return ret;
}
+/*
+ * In xattr remove, if it is stored outside and refcounted, we may have
+ * the chance to split the refcount tree. So need the allocators.
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ int *ref_credits)
+{
+ int ret, meta_add = 0;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+
+ *ref_credits = 0;
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+ ref_root_bh, xv,
+ &meta_add, ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
static int ocfs2_remove_value_outside(struct inode*inode,
struct ocfs2_xattr_value_buf *vb,
- struct ocfs2_xattr_header *header)
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
- int ret = 0, i;
+ int ret = 0, i, ref_credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ void *val;
ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
- ctxt.handle = ocfs2_start_trans(osb,
- ocfs2_remove_extent_credits(osb->sb));
- if (IS_ERR(ctxt.handle)) {
- ret = PTR_ERR(ctxt.handle);
- mlog_errno(ret);
- goto out;
- }
-
for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
- if (!ocfs2_xattr_is_local(entry)) {
- void *val;
+ if (ocfs2_xattr_is_local(entry))
+ continue;
- val = (void *)header +
- le16_to_cpu(entry->xe_name_offset);
- vb->vb_xv = (struct ocfs2_xattr_value_root *)
- (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
- ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
+ val = (void *)header +
+ le16_to_cpu(entry->xe_name_offset);
+ vb->vb_xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+ ref_ci, ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+ ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
}
}
- ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &ctxt.dealloc);
-out:
return ret;
}
static int ocfs2_xattr_ibody_remove(struct inode *inode,
- struct buffer_head *di_bh)
+ struct buffer_head *di_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1759,13 +1876,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
((void *)di + inode->i_sb->s_blocksize -
le16_to_cpu(di->i_xattr_inline_size));
- ret = ocfs2_remove_value_outside(inode, &vb, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
return ret;
}
+struct ocfs2_rm_xattr_bucket_para {
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+};
+
static int ocfs2_xattr_block_remove(struct inode *inode,
- struct buffer_head *blk_bh)
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct ocfs2_xattr_block *xb;
int ret = 0;
@@ -1773,19 +1898,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
.vb_bh = blk_bh,
.vb_access = ocfs2_journal_access_xb,
};
+ struct ocfs2_rm_xattr_bucket_para args = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ };
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
- ret = ocfs2_remove_value_outside(inode, &vb, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
} else
- ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+ ret = ocfs2_iterate_xattr_index_block(inode,
+ blk_bh,
+ ocfs2_rm_xattr_cluster,
+ &args);
return ret;
}
static int ocfs2_xattr_free_block(struct inode *inode,
- u64 block)
+ u64 block,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct inode *xb_alloc_inode;
struct buffer_head *xb_alloc_bh = NULL;
@@ -1803,7 +1938,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
goto out;
}
- ret = ocfs2_xattr_block_remove(inode, blk_bh);
+ ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1863,6 +1998,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_caching_info *ref_ci = NULL;
handle_t *handle;
int ret;
@@ -1872,8 +2010,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return 0;
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ref_ci = &ref_tree->rf_ci;
+
+ }
+
if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
- ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+ ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+ ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1882,7 +2033,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (di->i_xattr_loc) {
ret = ocfs2_xattr_free_block(inode,
- le64_to_cpu(di->i_xattr_loc));
+ le64_to_cpu(di->i_xattr_loc),
+ ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1896,7 +2048,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1916,6 +2068,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+ brelse(ref_root_bh);
return ret;
}
@@ -2083,6 +2238,84 @@ cleanup:
return ret;
}
+static int ocfs2_create_xattr_block(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *inode_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_bh,
+ int indexed)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_xattr_block *xblk;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &first_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
+ new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ /* Initialize ocfs2_xattr_block */
+ xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+ memset(xblk, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+ xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+ xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+ xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+ if (indexed) {
+ struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
+ xr->xt_clusters = cpu_to_le32(1);
+ xr->xt_last_eb_blk = 0;
+ xr->xt_list.l_tree_depth = 0;
+ xr->xt_list.l_count = cpu_to_le16(
+ ocfs2_xattr_recs_per_xb(inode->i_sb));
+ xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+ xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+ }
+
+ ret = ocfs2_journal_dirty(handle, new_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+ di->i_xattr_loc = cpu_to_le64(first_blkno);
+ ocfs2_journal_dirty(handle, inode_bh);
+
+ *ret_bh = new_bh;
+ new_bh = NULL;
+
+end:
+ brelse(new_bh);
+ return ret;
+}
+
/*
* ocfs2_xattr_block_set()
*
@@ -2095,63 +2328,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
struct ocfs2_xattr_set_ctxt *ctxt)
{
struct buffer_head *new_bh = NULL;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
handle_t *handle = ctxt->handle;
struct ocfs2_xattr_block *xblk = NULL;
- u16 suballoc_bit_start;
- u32 num_got;
- u64 first_blkno;
int ret;
if (!xs->xattr_bh) {
- ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
-
- ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
- &suballoc_bit_start, &num_got,
- &first_blkno);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
-
- new_bh = sb_getblk(inode->i_sb, first_blkno);
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
-
- ret = ocfs2_journal_access_xb(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
+ ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
+ ctxt->meta_ac, &new_bh, 0);
+ if (ret) {
mlog_errno(ret);
goto end;
}
- /* Initialize ocfs2_xattr_block */
xs->xattr_bh = new_bh;
- xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
- memset(xblk, 0, inode->i_sb->s_blocksize);
- strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
- xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
- xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
- xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
- xblk->xb_blkno = cpu_to_le64(first_blkno);
-
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
xs->header = &xblk->xb_attrs.xb_header;
xs->base = (void *)xs->header;
xs->end = (void *)xblk + inode->i_sb->s_blocksize;
xs->here = xs->header->xh_entries;
-
- ret = ocfs2_journal_dirty(handle, new_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
- di->i_xattr_loc = cpu_to_le64(first_blkno);
- ocfs2_journal_dirty(handle, xs->inode_bh);
} else
xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
@@ -2273,7 +2467,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
old_in_xb = 1;
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(xbs->bucket),
i, &block_off,
&name_offset);
@@ -2428,6 +2622,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
struct ocfs2_xattr_search *xis,
struct ocfs2_xattr_search *xbs,
struct ocfs2_xattr_set_ctxt *ctxt,
+ int extra_meta,
int *credits)
{
int clusters_add, meta_add, ret;
@@ -2444,6 +2639,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
return ret;
}
+ meta_add += extra_meta;
mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
"credits = %d\n", xi->name, meta_add, clusters_add, *credits);
@@ -2598,7 +2794,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
if (!ret) {
/* Update inode ctime. */
- ret = ocfs2_journal_access_di(ctxt->handle, inode,
+ ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
xis->inode_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2711,10 +2907,11 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret, credits;
+ int ret, credits, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ struct ocfs2_refcount_tree *ref_tree = NULL;
struct ocfs2_xattr_info xi = {
.name_index = name_index,
@@ -2779,6 +2976,17 @@ int ocfs2_xattr_set(struct inode *inode,
goto cleanup;
}
+ /* Check whether the value is refcounted and do some prepartion. */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+ (!xis.not_found || !xbs.not_found)) {
+ ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+ &xis, &xbs, &ref_tree,
+ &ref_meta, &ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
mutex_lock(&tl_inode->i_mutex);
@@ -2793,7 +3001,7 @@ int ocfs2_xattr_set(struct inode *inode,
mutex_unlock(&tl_inode->i_mutex);
ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
- &xbs, &ctxt, &credits);
+ &xbs, &ctxt, ref_meta, &credits);
if (ret) {
mlog_errno(ret);
goto cleanup;
@@ -2801,7 +3009,7 @@ int ocfs2_xattr_set(struct inode *inode,
/* we need to update inode's ctime field, so add credit for it. */
credits += OCFS2_INODE_UPDATE_CREDITS;
- ctxt.handle = ocfs2_start_trans(osb, credits);
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
if (IS_ERR(ctxt.handle)) {
ret = PTR_ERR(ctxt.handle);
mlog_errno(ret);
@@ -2819,8 +3027,16 @@ int ocfs2_xattr_set(struct inode *inode,
if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
cleanup:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ if (!value && !ret) {
+ ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
ocfs2_inode_unlock(inode, 1);
cleanup_nolock:
brelse(di_bh);
@@ -2849,7 +3065,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
u64 e_blkno = 0;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2931,7 +3148,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
if (cmp)
continue;
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
xh,
i,
&block_off,
@@ -3175,7 +3392,7 @@ struct ocfs2_xattr_tree_list {
size_t result;
};
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
struct ocfs2_xattr_header *xh,
int index,
int *block_off,
@@ -3188,8 +3405,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
- *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
- *new_offset = name_offset % inode->i_sb->s_blocksize;
+ *block_off = name_offset >> sb->s_blocksize_bits;
+ *new_offset = name_offset % sb->s_blocksize;
return 0;
}
@@ -3209,7 +3426,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
prefix = ocfs2_xattr_prefix(type);
if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(bucket),
i,
&block_off,
@@ -3232,22 +3449,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
- struct ocfs2_xattr_tree_root *xt,
- char *buffer,
- size_t buffer_size)
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para)
{
- struct ocfs2_extent_list *el = &xt->xt_list;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
int ret = 0;
u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
u64 p_blkno = 0;
- struct ocfs2_xattr_tree_list xl = {
- .buffer = buffer,
- .buffer_size = buffer_size,
- .result = 0,
- };
- if (le16_to_cpu(el->l_next_free_rec) == 0)
+ if (!el->l_next_free_rec || !rec_func)
return 0;
while (name_hash > 0) {
@@ -3255,16 +3469,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
&e_cpos, &num_clusters, el);
if (ret) {
mlog_errno(ret);
- goto out;
+ break;
}
- ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
- ocfs2_list_xattr_bucket,
- &xl);
+ ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+ num_clusters, para);
if (ret) {
if (ret != -ERANGE)
mlog_errno(ret);
- goto out;
+ break;
}
if (e_cpos == 0)
@@ -3273,6 +3486,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
name_hash = e_cpos - 1;
}
+ return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ char *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_xattr_tree_list xl = {
+ .buffer = buffer,
+ .buffer_size = buffer_size,
+ .result = 0,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_list_xattr_tree_rec, &xl);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = xl.result;
out:
return ret;
@@ -3426,7 +3670,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
down_write(&oi->ip_alloc_sem);
- ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4263,9 +4507,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
prev_cpos, (unsigned long long)bucket_blkno(first));
- ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
- ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -4319,7 +4563,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
num_bits, (unsigned long long)block, v_start);
- ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+ ret = ocfs2_insert_extent(handle, &et, v_start, block,
num_bits, 0, ctxt->meta_ac);
if (ret < 0) {
mlog_errno(ret);
@@ -4798,10 +5042,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
struct ocfs2_xattr_entry *xe = xs->here;
struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
void *base;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
- ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
xe - xh->xh_entries,
&block_off,
&offset);
@@ -4814,8 +5061,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
xv = (struct ocfs2_xattr_value_root *)(base + offset +
OCFS2_XATTR_SIZE(xe->xe_name_len));
+ vb.vb_xv = xv;
+ vb.vb_bh = xs->bucket->bu_bhs[block_off];
ret = __ocfs2_xattr_set_value_outside(inode, handle,
- xv, val, value_len);
+ &vb, val, value_len);
if (ret)
mlog_errno(ret);
out:
@@ -4826,7 +5075,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
struct buffer_head *root_bh,
u64 blkno,
u32 cpos,
- u32 len)
+ u32 len,
+ void *para)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4838,14 +5088,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+ ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_delete_xattr_in_bucket, para);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
cpos, len, (unsigned long long)blkno);
- ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
+ len);
ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
if (ret) {
@@ -4870,14 +5128,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
&dealloc);
if (ret) {
mlog_errno(ret);
@@ -5220,7 +5478,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
struct ocfs2_xattr_bucket *bucket,
void *para)
{
- int ret = 0;
+ int ret = 0, ref_credits;
struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u16 i;
struct ocfs2_xattr_entry *xe;
@@ -5228,7 +5486,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
int credits = ocfs2_remove_extent_credits(osb->sb) +
ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_rm_xattr_bucket_para *args =
+ (struct ocfs2_rm_xattr_bucket_para *)para;
ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
@@ -5237,7 +5497,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
if (ocfs2_xattr_is_local(xe))
continue;
- ctxt.handle = ocfs2_start_trans(osb, credits);
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+ i, &xv, NULL);
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+ args->ref_ci,
+ args->ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
if (IS_ERR(ctxt.handle)) {
ret = PTR_ERR(ctxt.handle);
mlog_errno(ret);
@@ -5248,57 +5517,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
i, 0, &ctxt);
ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
+ }
if (ret) {
mlog_errno(ret);
break;
}
}
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &ctxt.dealloc);
return ret;
}
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
- struct buffer_head *xb_bh)
+/*
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
+ *
+ * Note:
+ * We have to give the extra credits for the caller.
+ */
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+ handle_t *handle,
+ void *para)
+{
+ int ret;
+ struct ocfs2_xattr_bucket *bucket =
+ (struct ocfs2_xattr_bucket *)para;
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+ return 0;
+}
+
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ * will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ * lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_add,
+ int *credits)
{
- struct ocfs2_xattr_block *xb =
- (struct ocfs2_xattr_block *)xb_bh->b_data;
- struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
int ret = 0;
- u32 name_hash = UINT_MAX, e_cpos, num_clusters;
- u64 p_blkno;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_entry *xe;
+ char *base;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+ int name_offset, name_len;
+ struct ocfs2_xattr_value_buf vb;
+ struct ocfs2_xattr_bucket *bucket = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_post_refcount refcount;
+ struct ocfs2_post_refcount *p = NULL;
+ struct buffer_head *ref_root_bh = NULL;
- if (le16_to_cpu(el->l_next_free_rec) == 0)
- return 0;
+ if (!xis->not_found) {
+ xe = xis->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ base = xis->base;
+ vb.vb_bh = xis->inode_bh;
+ vb.vb_access = ocfs2_journal_access_di;
+ } else {
+ int i, block_off = 0;
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+ xe = xbs->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ i = xbs->here - xbs->header->xh_entries;
- while (name_hash > 0) {
- ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
- &e_cpos, &num_clusters, el);
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(xbs->bucket),
+ i, &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ base = bucket_block(xbs->bucket, block_off);
+ vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+ vb.vb_access = ocfs2_journal_access;
+
+ if (ocfs2_meta_ecc(osb)) {
+ /*create parameters for ocfs2_post_refcount. */
+ bucket = xbs->bucket;
+ refcount.credits = bucket->bu_blocks;
+ refcount.para = bucket;
+ refcount.func =
+ ocfs2_xattr_bucket_post_refcount;
+ p = &refcount;
+ }
+ } else {
+ base = xbs->base;
+ vb.vb_bh = xbs->xattr_bh;
+ vb.vb_access = ocfs2_journal_access_xb;
+ }
+ }
+
+ if (ocfs2_xattr_is_local(xe))
+ goto out;
+
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (base + name_offset + name_len);
+
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters, &vb.vb_xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We just need to check the 1st extent record, since we always
+ * CoW the whole xattr. So there shouldn't be a xattr with
+ * some REFCOUNT extent recs after the 1st one.
+ */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If we are deleting the xattr or the new size will be stored inside,
+ * cool, leave it there, the xattr truncate process will remove them
+ * for us(it still needs the refcount tree lock and the meta, credits).
+ * And the worse case is that every cluster truncate will split the
+ * refcount tree, and make the original extent become 3. So we will need
+ * 2 * cluster more extent recs at most.
+ */
+ if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+ ret = ocfs2_refcounted_xattr_delete_need(inode,
+ &(*ref_tree)->rf_ci,
+ ref_root_bh, vb.vb_xv,
+ meta_add, credits);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+ *ref_tree, ref_root_bh, 0,
+ le32_to_cpu(vb.vb_xv->xr_clusters), p);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
+ */
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_extent_tree *value_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *refcount)
+{
+ int ret = 0;
+ u32 clusters = le32_to_cpu(xv->xr_clusters);
+ u32 cpos, p_cluster, num_clusters;
+ struct ocfs2_extent_list *el = &xv->xr_list;
+ unsigned int ext_flags;
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el, &ext_flags);
+
+ cpos += num_clusters;
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+ continue;
+
+ BUG_ON(!p_cluster);
+
+ ret = ocfs2_add_refcount_flag(inode, value_et,
+ ref_ci, ref_root_bh,
+ cpos - num_clusters,
+ p_cluster, num_clusters,
+ dealloc, refcount);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_extent_tree et;
+ int i, ret = 0;
+
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ xe = &header->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ xv = (struct ocfs2_xattr_value_root *)((void *)header +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ vb->vb_xv = xv;
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+ ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+ ref_ci, ref_root_bh,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+ (fe_bh->b_data + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = fe_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh)
+{
+ int ret, block_off, name_offset;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+ void *base;
+
+ ret = ocfs2_xattr_bucket_get_name_value(sb,
+ bucket_xh(bucket),
+ offset,
+ &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ base = bucket_block(bucket, block_off);
+
+ *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (bh)
+ *bh = bucket->bu_bhs[block_off];
+out:
+ return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_xattr_tree_value_refcount_para *ref =
+ (struct ocfs2_xattr_tree_value_refcount_para *)para;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+ struct ocfs2_post_refcount refcount = {
+ .credits = bucket->bu_blocks,
+ .para = bucket,
+ .func = ocfs2_xattr_bucket_post_refcount,
+ };
+ struct ocfs2_post_refcount *p = NULL;
+
+ /* We only need post_refcount if we support metaecc. */
+ if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+ p = &refcount;
+
+ mlog(0, "refcount bucket %llu, count = %u\n",
+ (unsigned long long)bucket_blkno(bucket),
+ le16_to_cpu(xh->xh_count));
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+ &vb.vb_xv, &vb.vb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_init_xattr_value_extent_tree(&et,
+ INODE_CACHE(inode), &vb);
+
+ ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+ &et, ref->ref_ci,
+ ref->ref_root_bh,
+ ref->dealloc, p);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_xattr_bucket_value_refcount,
+ para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh,
+ dealloc);
+ } else {
+ struct ocfs2_xattr_tree_value_refcount_para para = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ .dealloc = dealloc,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_refcount_xattr_tree_rec,
+ &para);
+ }
+
+ return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+ ref_ci, ref_root_bh,
+ dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
+ }
+
+ if (!di->i_xattr_loc)
+ goto out;
+
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+ ref_root_bh, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+out:
+
+ return ret;
+}
+
+typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
+/*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+ struct inode *old_inode;
+ struct inode *new_inode;
+ struct buffer_head *old_bh;
+ struct buffer_head *new_bh;
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+ should_xattr_reflinked *xattr_reflinked;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementaions.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int *metas, int *credits,
+ int *num_recs,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe;
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+ le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+ *credits += ocfs2_calc_extend_credits(sb,
+ &def_xv.xv.xr_list,
+ le32_to_cpu(xv->xr_clusters));
+
+ /*
+ * If the value is a tree with depth > 1, We don't go deep
+ * to the extent block, so just calculate a maximum record num.
+ */
+ if (!xv->xr_list.l_tree_depth)
+ *num_recs += xv->xr_list.l_next_free_rec;
+ else
+ *num_recs += ocfs2_clusters_for_bytes(sb,
+ XATTR_SIZE_MAX);
+ }
+
+ return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+ *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (ret_bh)
+ *ret_bh = bh;
+
+ return 0;
+}
+
+/*
+ * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *ref_root_bh,
+ int *credits,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret, meta_add = 0, num_recs = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ *credits = 0;
+
+ ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+ &meta_add, credits, &num_recs,
+ ocfs2_get_xattr_value_root,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We need to add/modify num_recs in refcount tree, so just calculate
+ * an approximate number we need for refcount tree change.
+ * Sometimes we need to split the tree, and after split, half recs
+ * will be moved to the new block, and a new block can only provide
+ * half number of recs. So we multiple new blocks by 2.
+ */
+ num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+ meta_add += num_recs;
+ *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
- ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
- ocfs2_delete_xattr_in_bucket,
- NULL);
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ *
+ * If args.xattr_reflinked is set, call it to decide whether the xe should
+ * be reflinked or not. If not, remove it from the new xattr header.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+ struct ocfs2_xattr_reflink *args,
+ struct buffer_head *old_bh,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *new_bh,
+ struct ocfs2_xattr_header *new_xh,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_alloc_context *meta_ac,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int ret = 0, i, j;
+ struct super_block *sb = args->old_inode->i_sb;
+ struct buffer_head *value_bh;
+ struct ocfs2_xattr_entry *xe, *last;
+ struct ocfs2_xattr_value_root *xv, *new_xv;
+ struct ocfs2_extent_tree data_et;
+ u32 clusters, cpos, p_cluster, num_clusters;
+ unsigned int ext_flags = 0;
+
+ mlog(0, "reflink xattr in container %llu, count = %u\n",
+ (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+
+ last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+ for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
+ xe = &xh->xh_entries[i];
+
+ if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
+ xe = &new_xh->xh_entries[j];
+
+ le16_add_cpu(&new_xh->xh_count, -1);
+ if (new_xh->xh_count) {
+ memmove(xe, xe + 1,
+ (void *)last - (void *)xe);
+ memset(last, 0,
+ sizeof(struct ocfs2_xattr_entry));
+ }
+
+ /*
+ * We don't want j to increase in the next round since
+ * it is already moved ahead.
+ */
+ j--;
+ continue;
+ }
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * For the xattr which has l_tree_depth = 0, all the extent
+ * recs have already be copied to the new xh with the
+ * propriate OCFS2_EXT_REFCOUNTED flag we just need to
+ * increase the refount count int the refcount tree.
+ *
+ * For the xattr which has l_tree_depth > 0, we need
+ * to initialize it to the empty default value root,
+ * and then insert the extents one by one.
+ */
+ if (xv->xr_list.l_tree_depth) {
+ memcpy(new_xv, &def_xv, sizeof(def_xv));
+ vb->vb_xv = new_xv;
+ vb->vb_bh = value_bh;
+ ocfs2_init_xattr_value_extent_tree(&data_et,
+ INODE_CACHE(args->new_inode), vb);
+ }
+
+ clusters = le32_to_cpu(xv->xr_clusters);
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(args->old_inode,
+ cpos,
+ &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!p_cluster);
+
+ if (xv->xr_list.l_tree_depth) {
+ ret = ocfs2_insert_extent(handle,
+ &data_et, cpos,
+ ocfs2_clusters_to_blocks(
+ args->old_inode->i_sb,
+ p_cluster),
+ num_clusters, ext_flags,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_increase_refcount(handle, args->ref_ci,
+ args->ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, args->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos += num_clusters;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+ int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+ int header_off = osb->sb->s_blocksize - inline_size;
+ struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+ (args->old_bh->b_data + header_off);
+ struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+ (args->new_bh->b_data + header_off);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_inode_info *new_oi;
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = args->new_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+ args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(args->new_bh->b_data + header_off,
+ args->old_bh->b_data + header_off, inline_size);
+
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+ args->new_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_oi = OCFS2_I(args->new_inode);
+ spin_lock(&new_oi->ip_lock);
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head **ret_bh,
+ int indexed)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "create new xattr block for inode %llu, index = %d\n",
+ (unsigned long long)fe_bh->b_blocknr, indexed);
+ ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
+ meta_ac, ret_bh, indexed);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh,
+ struct buffer_head *new_blk_bh)
+{
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+ int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_block *new_xb =
+ (struct ocfs2_xattr_block *)new_blk_bh->b_data;
+ struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = new_blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* One more credits in case we need to add xattr flags in new inode. */
+ handle = ocfs2_start_trans(osb, credits + 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ ret = ocfs2_journal_access_di(handle,
+ INODE_CACHE(args->new_inode),
+ args->new_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+ new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+ osb->sb->s_blocksize - header_off);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+ new_blk_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_journal_dirty(handle, new_blk_bh);
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ spin_lock(&new_oi->ip_lock);
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args {
+ struct ocfs2_xattr_reflink *reflink;
+ struct buffer_head *old_blk_bh;
+ struct buffer_head *new_blk_bh;
+ struct ocfs2_xattr_bucket *old_bucket;
+ struct ocfs2_xattr_bucket *new_bucket;
+};
+
+/*
+ * NOTE:
+ * We have to handle the case that both old bucket and new bucket
+ * will call this function to get the right ret_bh.
+ * So The caller must give us the right bh.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_reflink_xattr_tree_args *args =
+ (struct ocfs2_reflink_xattr_tree_args *)para;
+ struct ocfs2_xattr_bucket *bucket;
+
+ if (bh == args->old_bucket->bu_bhs[0])
+ bucket = args->old_bucket;
+ else
+ bucket = args->new_bucket;
+
+ return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+ xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas {
+ int num_metas;
+ int credits;
+ int num_recs;
+};
+
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_xattr_bucket *bucket =
+ (struct ocfs2_xattr_bucket *)para;
+
+ return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+ xv, ret_bh);
+}
+
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ struct ocfs2_value_tree_metas *metas =
+ (struct ocfs2_value_tree_metas *)para;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+
+ /* Add the credits for this bucket first. */
+ metas->credits += bucket->bu_blocks;
+ return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
+ xh, &metas->num_metas,
+ &metas->credits, &metas->num_recs,
+ ocfs2_value_tree_metas_in_bucket,
+ bucket);
+}
+
+/*
+ * Given a xattr extent rec starting from blkno and having len clusters,
+ * iterate all the buckets calculate how much metadata we need for reflinking
+ * all the ocfs2_xattr_value_root and lock the allocators accordingly.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *xt_et,
+ u64 blkno, u32 len, int *credits,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac)
+{
+ int ret, num_free_extents;
+ struct ocfs2_value_tree_metas metas;
+ struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+
+ memset(&metas, 0, sizeof(metas));
+
+ ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+ ocfs2_calc_value_tree_metas, &metas);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *credits = metas.credits;
+
+ /*
+ * Calculate we need for refcount tree change.
+ *
+ * We need to add/modify num_recs in refcount tree, so just calculate
+ * an approximate number we need for refcount tree change.
+ * Sometimes we need to split the tree, and after split, half recs
+ * will be moved to the new block, and a new block can only provide
+ * half number of recs. So we multiple new blocks by 2.
+ * In the end, we have to add credits for modifying the already
+ * existed refcount block.
+ */
+ rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
+ metas.num_recs =
+ (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
+ ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+ metas.num_metas += metas.num_recs;
+ *credits += metas.num_recs +
+ metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ /* count in the xattr tree change. */
+ num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (num_free_extents < len)
+ metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(osb->sb,
+ xt_et->et_root_el, len);
+
+ if (metas.num_metas) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+ meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
+ }
- ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
- p_blkno, e_cpos, num_clusters);
+ if (len) {
+ ret = ocfs2_reserve_clusters(osb, len, data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ u64 blkno, u64 new_blkno, u32 clusters,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_reflink_xattr_tree_args *args)
+{
+ int i, j, ret = 0;
+ struct super_block *sb = args->reflink->old_inode->i_sb;
+ u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+ u32 num_buckets = clusters * bpc;
+ int bpb = args->old_bucket->bu_blocks;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+
+ for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
if (ret) {
mlog_errno(ret);
break;
}
- if (e_cpos == 0)
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+ if (ret) {
+ mlog_errno(ret);
break;
+ }
- name_hash = e_cpos - 1;
+ /*
+ * The real bucket num in this series of blocks is stored
+ * in the 1st bucket.
+ */
+ if (i == 0)
+ num_buckets = le16_to_cpu(
+ bucket_xh(args->old_bucket)->xh_num_buckets);
+
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ for (j = 0; j < bpb; j++)
+ memcpy(bucket_block(args->new_bucket, j),
+ bucket_block(args->old_bucket, j),
+ sb->s_blocksize);
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+ ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+ args->old_bucket->bu_bhs[0],
+ bucket_xh(args->old_bucket),
+ args->new_bucket->bu_bhs[0],
+ bucket_xh(args->new_bucket),
+ &vb, meta_ac,
+ ocfs2_get_reflink_xattr_value_root,
+ args);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * Re-access and dirty the bucket to calculate metaecc.
+ * Because we may extend the transaction in reflink_xattr_header
+ * which will let the already accessed block gone.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+ ocfs2_xattr_bucket_relse(args->new_bucket);
+ }
+
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+ ocfs2_xattr_bucket_relse(args->new_bucket);
+ return ret;
+}
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para)
+{
+ int ret, credits = 0;
+ u32 p_cluster, num_clusters;
+ u64 new_blkno;
+ handle_t *handle;
+ struct ocfs2_reflink_xattr_tree_args *args =
+ (struct ocfs2_reflink_xattr_tree_args *)para;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_tree_extent_tree(&et,
+ INODE_CACHE(args->reflink->new_inode),
+ args->new_blk_bh);
+
+ ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+ len, &credits,
+ &meta_ac, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_clusters(osb, handle, data_ac,
+ len, &p_cluster, &num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
+
+ mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
+ (unsigned long long)blkno, (unsigned long long)new_blkno, len);
+ ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
+ meta_ac, data_ac, args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+ (unsigned long long)new_blkno, len, cpos);
+ ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
+ len, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ return ret;
+}
+
+/*
+ * Create reflinked xattr buckets.
+ * We will add bucket one by one, and refcount all the xattrs in the bucket
+ * if they are stored outside.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh,
+ struct buffer_head *new_blk_bh)
+{
+ int ret;
+ struct ocfs2_reflink_xattr_tree_args para;
+
+ memset(&para, 0, sizeof(para));
+ para.reflink = args;
+ para.old_blk_bh = blk_bh;
+ para.new_blk_bh = new_blk_bh;
+
+ para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+ if (!para.old_bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+ if (!para.new_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
+ ocfs2_reflink_xattr_rec,
+ &para);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_xattr_bucket_free(para.old_bucket);
+ ocfs2_xattr_bucket_free(para.new_bucket);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh)
+{
+ int ret, indexed = 0;
+ struct buffer_head *new_blk_bh = NULL;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
+ indexed = 1;
+
+ ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+ &new_blk_bh, indexed);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+ ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
+ else
+ ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_blk_bh);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
+{
+ int type = ocfs2_xattr_get_type(xe);
+
+ return type != OCFS2_XATTR_INDEX_SECURITY &&
+ type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
+ type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security)
+{
+ int ret;
+ struct ocfs2_xattr_reflink args;
+ struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh = NULL;
+
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ args.old_inode = old_inode;
+ args.new_inode = new_inode;
+ args.old_bh = old_bh;
+ args.new_bh = new_bh;
+ args.ref_ci = &ref_tree->rf_ci;
+ args.ref_root_bh = ref_root_bh;
+ args.dealloc = &dealloc;
+ if (preserve_security)
+ args.xattr_reflinked = NULL;
+ else
+ args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_reflink_xattr_inline(&args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+ }
+
+ if (!di->i_xattr_loc)
+ goto out_unlock;
+
+ ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+
+out_unlock:
+ ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ ref_tree, 1);
+ brelse(ref_root_bh);
+
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+ ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
}
out:
@@ -5306,6 +6957,51 @@ out:
}
/*
+ * Initialize security and acl for a already created inode.
+ * Used for reflink a non-preserve-security file.
+ *
+ * It uses common api like ocfs2_xattr_set, so the caller
+ * must not hold any lock expect i_mutex.
+ */
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode)
+{
+ int ret = 0;
+ struct buffer_head *dir_bh = NULL;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+
+ ret = ocfs2_init_security_get(inode, dir, &si);
+ if (!ret) {
+ ret = ocfs2_xattr_security_set(inode, si.name,
+ si.value, si.value_len,
+ XATTR_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ } else if (ret != -EOPNOTSUPP) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_inode_lock(dir, &dir_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_inode_unlock(dir, 0);
+ brelse(dir_bh);
+leave:
+ return ret;
+}
+/*
* 'security' attributes support
*/
static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a1b7bc..abd72a47f520 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info {
extern struct xattr_handler ocfs2_xattr_user_handler;
extern struct xattr_handler ocfs2_xattr_trusted_handler;
extern struct xattr_handler ocfs2_xattr_security_handler;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
extern struct xattr_handler ocfs2_xattr_acl_access_handler;
extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-#endif
extern struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
@@ -55,6 +53,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
int, const char *, const void *, size_t, int,
struct ocfs2_alloc_context *,
struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di);
int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
int ocfs2_init_security_get(struct inode *, struct inode *,
struct ocfs2_security_xattr_info *);
@@ -83,5 +83,16 @@ struct ocfs2_xattr_value_buf {
struct ocfs2_xattr_value_root *vb_xv;
};
-
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security);
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
return ret;
}
-struct inode_operations omfs_dir_inops = {
+const struct inode_operations omfs_dir_inops = {
.lookup = omfs_lookup,
.mkdir = omfs_mkdir,
.rename = omfs_rename,
@@ -498,7 +498,7 @@ struct inode_operations omfs_dir_inops = {
.rmdir = omfs_rmdir,
};
-struct file_operations omfs_dir_operations = {
+const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
.readdir = omfs_readdir,
.llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, omfs_get_block);
}
-struct file_operations omfs_file_operations = {
+const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations omfs_file_inops = {
+const struct inode_operations omfs_file_inops = {
.truncate = omfs_truncate
};
-struct address_space_operations omfs_aops = {
+const struct address_space_operations omfs_aops = {
.readpage = omfs_readpage,
.readpages = omfs_readpages,
.writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct super_operations omfs_sops = {
+static const struct super_operations omfs_sops = {
.write_inode = omfs_write_inode,
.delete_inode = omfs_delete_inode,
.put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,16 +44,16 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
/* dir.c */
-extern struct file_operations omfs_dir_operations;
-extern struct inode_operations omfs_dir_inops;
+extern const struct file_operations omfs_dir_operations;
+extern const struct inode_operations omfs_dir_inops;
extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
u64 fsblock);
/* file.c */
-extern struct file_operations omfs_file_operations;
-extern struct inode_operations omfs_file_inops;
-extern struct address_space_operations omfs_aops;
+extern const struct file_operations omfs_file_operations;
+extern const struct inode_operations omfs_file_inops;
+extern const struct address_space_operations omfs_aops;
extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
extern int omfs_shrink_inode(struct inode *inode);
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..4f01e06227c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
struct file *filp)
{
- int err;
+ int ret;
struct iattr newattrs;
/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
}
/* Remove suid/sgid on truncate too */
- newattrs.ia_valid |= should_remove_suid(dentry);
+ ret = should_remove_suid(dentry);
+ if (ret)
+ newattrs.ia_valid |= ret | ATTR_FORCE;
mutex_lock(&dentry->d_inode->i_mutex);
- err = notify_change(dentry, &newattrs);
+ ret = notify_change(dentry, &newattrs);
mutex_unlock(&dentry->d_inode->i_mutex);
- return err;
+ return ret;
}
static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -288,10 +290,9 @@ out:
return error;
}
-SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
+SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
- /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
- return do_sys_truncate(path, (long)length);
+ return do_sys_truncate(path, length);
}
static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
@@ -957,6 +958,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
int error;
struct file *f;
+ validate_creds(cred);
+
/*
* We must always pass in a valid mount pointer. Historically
* callers got away with not passing it, but we must enforce this at
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..7b685e10cbad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
part_stat_read(p, merges[WRITE]),
(unsigned long long)part_stat_read(p, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
- p->in_flight,
+ part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
}
+ssize_t part_inflight_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_stat.attr,
+ &dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
&blk_trace_attr_group,
@@ -571,7 +581,7 @@ try_scan:
}
if (from + size > get_capacity(disk)) {
- struct block_device_operations *bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
unsigned long long capacity;
printk(KERN_WARNING
diff --git a/fs/pipe.c b/fs/pipe.c
index 52c415114838..ae17d026aaa3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
static int
pipe_read_open(struct inode *inode, struct file *filp)
{
- /* We could have perhaps used atomic_t, but this and friends
- below are the only places. So it doesn't seem worthwhile. */
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- inode->i_pipe->readers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ inode->i_pipe->readers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
static int
pipe_write_open(struct inode *inode, struct file *filp)
{
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- inode->i_pipe->writers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ inode->i_pipe->writers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- if (filp->f_mode & FMODE_READ)
- inode->i_pipe->readers++;
- if (filp->f_mode & FMODE_WRITE)
- inode->i_pipe->writers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ if (filp->f_mode & FMODE_READ)
+ inode->i_pipe->readers++;
+ if (filp->f_mode & FMODE_WRITE)
+ inode->i_pipe->writers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
/*
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..822c2d506518 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -321,6 +322,94 @@ static inline void task_context_switch_counts(struct seq_file *m,
p->nivcsw);
}
+#ifdef CONFIG_MMU
+
+struct stack_stats {
+ struct vm_area_struct *vma;
+ unsigned long startpage;
+ unsigned long usage;
+};
+
+static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct stack_stats *ss = walk->private;
+ struct vm_area_struct *vma = ss->vma;
+ pte_t *pte, ptent;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+#ifdef CONFIG_STACK_GROWSUP
+ if (pte_present(ptent) || is_swap_pte(ptent))
+ ss->usage = addr - ss->startpage + PAGE_SIZE;
+#else
+ if (pte_present(ptent) || is_swap_pte(ptent)) {
+ ss->usage = ss->startpage - addr + PAGE_SIZE;
+ pte++;
+ ret = 1;
+ break;
+ }
+#endif
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return ret;
+}
+
+static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
+ struct task_struct *task)
+{
+ struct stack_stats ss;
+ struct mm_walk stack_walk = {
+ .pmd_entry = stack_usage_pte_range,
+ .mm = vma->vm_mm,
+ .private = &ss,
+ };
+
+ if (!vma->vm_mm || is_vm_hugetlb_page(vma))
+ return 0;
+
+ ss.vma = vma;
+ ss.startpage = task->stack_start & PAGE_MASK;
+ ss.usage = 0;
+
+#ifdef CONFIG_STACK_GROWSUP
+ walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
+ &stack_walk);
+#else
+ walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
+ &stack_walk);
+#endif
+ return ss.usage;
+}
+
+static inline void task_show_stack_usage(struct seq_file *m,
+ struct task_struct *task)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = get_task_mm(task);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, task->stack_start);
+ if (vma)
+ seq_printf(m, "Stack usage:\t%lu kB\n",
+ get_stack_usage_in_bytes(vma, task) >> 10);
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+}
+#else
+static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
+{
+}
+#endif /* CONFIG_MMU */
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -340,6 +429,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_show_regs(m, task);
#endif
task_context_switch_counts(m, task);
+ task_show_stack_usage(m, task);
return 0;
}
@@ -481,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
rsslim,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
- (permitted && mm) ? mm->start_stack : 0,
+ (permitted && mm) ? task->stack_start : 0,
esp,
eip,
/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..af643b5aefe8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task, uptime.tv_sec);
+ points = badness(task->group_leader, uptime.tv_sec);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -458,7 +458,7 @@ struct limit_names {
};
static const struct limit_names lnames[RLIM_NLIMITS] = {
- [RLIMIT_CPU] = {"Max cpu time", "ms"},
+ [RLIMIT_CPU] = {"Max cpu time", "seconds"},
[RLIMIT_FSIZE] = {"Max file size", "bytes"},
[RLIMIT_DATA] = {"Max data size", "bytes"},
[RLIMIT_STACK] = {"Max stack size", "bytes"},
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF];
size_t len;
- int oom_adjust;
+ int oom_adjust = OOM_DISABLE;
+ unsigned long flags;
if (!task)
return -ESRCH;
- oom_adjust = task->oomkilladj;
+
+ if (lock_task_sighand(task, &flags)) {
+ oom_adjust = task->signal->oom_adj;
+ unlock_task_sighand(task, &flags);
+ }
+
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
- int oom_adjust;
+ char buffer[PROC_NUMBUF];
+ long oom_adjust;
+ unsigned long flags;
+ int err;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- oom_adjust = simple_strtol(buffer, &end, 0);
+
+ err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+ if (err)
+ return -EINVAL;
if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
oom_adjust != OOM_DISABLE)
return -EINVAL;
- if (*end == '\n')
- end++;
+
task = get_proc_task(file->f_path.dentry->d_inode);
if (!task)
return -ESRCH;
- if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+ if (!lock_task_sighand(task, &flags)) {
+ put_task_struct(task);
+ return -ESRCH;
+ }
+
+ if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
return -EACCES;
}
- task->oomkilladj = oom_adjust;
+
+ task->signal->oom_adj = oom_adjust;
+
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
+
+ return count;
}
static const struct file_operations proc_oom_adjust_operations = {
@@ -1169,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- make_it_fail = simple_strtol(buffer, &end, 0);
- if (*end == '\n')
- end++;
+ make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
+ if (*end)
+ return -EINVAL;
task = get_proc_task(file->f_dentry->d_inode);
if (!task)
return -ESRCH;
task->make_it_fail = make_it_fail;
put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
+
+ return count;
}
static const struct file_operations proc_fault_inject_operations = {
@@ -2580,15 +2597,11 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
name.len = snprintf(buf, sizeof(buf), "%d", pid);
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
- if (!(current->flags & PF_EXITING))
- shrink_dcache_parent(dentry);
+ shrink_dcache_parent(dentry);
d_drop(dentry);
dput(dentry);
}
- if (tgid == 0)
- goto out;
-
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2645,17 +2658,16 @@ out:
void proc_flush_task(struct task_struct *task)
{
int i;
- struct pid *pid, *tgid = NULL;
+ struct pid *pid, *tgid;
struct upid *upid;
pid = task_pid(task);
- if (thread_group_leader(task))
- tgid = task_tgid(task);
+ tgid = task_tgid(task);
for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid ? tgid->numbers[i].nr : 0);
+ tgid->numbers[i].nr);
}
upid = &pid->numbers[pid->level];
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..a44a7897fd4d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,9 +17,14 @@
#include <linux/elfcore.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
+#include <linux/bootmem.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <asm/io.h>
+#include <linux/list.h>
+#include <linux/ioport.h>
+#include <linux/memory.h>
+#include <asm/sections.h>
#define CORE_STR "CORE"
@@ -29,17 +34,6 @@
static struct proc_dir_entry *proc_root_kcore;
-static int open_kcore(struct inode * inode, struct file * filp)
-{
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
-
-static const struct file_operations proc_kcore_operations = {
- .read = read_kcore,
- .open = open_kcore,
-};
#ifndef kc_vaddr_to_offset
#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
@@ -57,18 +51,19 @@ struct memelfnote
void *data;
};
-static struct kcore_list *kclist;
+static LIST_HEAD(kclist_head);
static DEFINE_RWLOCK(kclist_lock);
+static int kcore_need_update = 1;
void
-kclist_add(struct kcore_list *new, void *addr, size_t size)
+kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
{
new->addr = (unsigned long)addr;
new->size = size;
+ new->type = type;
write_lock(&kclist_lock);
- new->next = kclist;
- kclist = new;
+ list_add_tail(&new->list, &kclist_head);
write_unlock(&kclist_lock);
}
@@ -80,7 +75,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
*nphdr = 1; /* PT_NOTE */
size = 0;
- for (m=kclist; m; m=m->next) {
+ list_for_each_entry(m, &kclist_head, list) {
try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > size)
size = try;
@@ -97,6 +92,177 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
return size + *elf_buflen;
}
+static void free_kclist_ents(struct list_head *head)
+{
+ struct kcore_list *tmp, *pos;
+
+ list_for_each_entry_safe(pos, tmp, head, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+}
+/*
+ * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
+ */
+static void __kcore_update_ram(struct list_head *list)
+{
+ int nphdr;
+ size_t size;
+ struct kcore_list *tmp, *pos;
+ LIST_HEAD(garbage);
+
+ write_lock(&kclist_lock);
+ if (kcore_need_update) {
+ list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+ if (pos->type == KCORE_RAM
+ || pos->type == KCORE_VMEMMAP)
+ list_move(&pos->list, &garbage);
+ }
+ list_splice_tail(list, &kclist_head);
+ } else
+ list_splice(list, &garbage);
+ kcore_need_update = 0;
+ proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+ write_unlock(&kclist_lock);
+
+ free_kclist_ents(&garbage);
+}
+
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
+ * because memory hole is not as big as !HIGHMEM case.
+ * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
+ */
+static int kcore_update_ram(void)
+{
+ LIST_HEAD(head);
+ struct kcore_list *ent;
+ int ret = 0;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->addr = (unsigned long)__va(0);
+ ent->size = max_low_pfn << PAGE_SHIFT;
+ ent->type = KCORE_RAM;
+ list_add(&ent->list, &head);
+ __kcore_update_ram(&head);
+ return ret;
+}
+
+#else /* !CONFIG_HIGHMEM */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/* calculate vmemmap's address from given system ram pfn and register it */
+int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+ unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
+ unsigned long nr_pages = ent->size >> PAGE_SHIFT;
+ unsigned long start, end;
+ struct kcore_list *vmm, *tmp;
+
+
+ start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
+ end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
+ end = ALIGN(end, PAGE_SIZE);
+ /* overlap check (because we have to align page */
+ list_for_each_entry(tmp, head, list) {
+ if (tmp->type != KCORE_VMEMMAP)
+ continue;
+ if (start < tmp->addr + tmp->size)
+ if (end > tmp->addr)
+ end = tmp->addr;
+ }
+ if (start < end) {
+ vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
+ if (!vmm)
+ return 0;
+ vmm->addr = start;
+ vmm->size = end - start;
+ vmm->type = KCORE_VMEMMAP;
+ list_add_tail(&vmm->list, head);
+ }
+ return 1;
+
+}
+#else
+int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+ return 1;
+}
+
+#endif
+
+static int
+kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+ struct list_head *head = (struct list_head *)arg;
+ struct kcore_list *ent;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
+ ent->size = nr_pages << PAGE_SHIFT;
+
+ /* Sanity check: Can happen in 32bit arch...maybe */
+ if (ent->addr < (unsigned long) __va(0))
+ goto free_out;
+
+ /* cut not-mapped area. ....from ppc-32 code. */
+ if (ULONG_MAX - ent->addr < ent->size)
+ ent->size = ULONG_MAX - ent->addr;
+
+ /* cut when vmalloc() area is higher than direct-map area */
+ if (VMALLOC_START > (unsigned long)__va(0)) {
+ if (ent->addr > VMALLOC_START)
+ goto free_out;
+ if (VMALLOC_START - ent->addr < ent->size)
+ ent->size = VMALLOC_START - ent->addr;
+ }
+
+ ent->type = KCORE_RAM;
+ list_add_tail(&ent->list, head);
+
+ if (!get_sparsemem_vmemmap_info(ent, head)) {
+ list_del(&ent->list);
+ goto free_out;
+ }
+
+ return 0;
+free_out:
+ kfree(ent);
+ return 1;
+}
+
+static int kcore_update_ram(void)
+{
+ int nid, ret;
+ unsigned long end_pfn;
+ LIST_HEAD(head);
+
+ /* Not inialized....update now */
+ /* find out "max pfn" */
+ end_pfn = 0;
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ unsigned long node_end;
+ node_end = NODE_DATA(nid)->node_start_pfn +
+ NODE_DATA(nid)->node_spanned_pages;
+ if (end_pfn < node_end)
+ end_pfn = node_end;
+ }
+ /* scan 0 to max_pfn */
+ ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
+ if (ret) {
+ free_kclist_ents(&head);
+ return -ENOMEM;
+ }
+ __kcore_update_ram(&head);
+ return ret;
+}
+#endif /* CONFIG_HIGHMEM */
/*****************************************************************************/
/*
@@ -192,7 +358,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
nhdr->p_align = 0;
/* setup ELF PT_LOAD program header for every area */
- for (m=kclist; m; m=m->next) {
+ list_for_each_entry(m, &kclist_head, list) {
phdr = (struct elf_phdr *) bufp;
bufp += sizeof(struct elf_phdr);
offset += sizeof(struct elf_phdr);
@@ -265,7 +431,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
unsigned long start;
read_lock(&kclist_lock);
- proc_root_kcore->size = size = get_kcore_size(&nphdr, &elf_buflen);
+ size = get_kcore_size(&nphdr, &elf_buflen);
+
if (buflen == 0 || *fpos >= size) {
read_unlock(&kclist_lock);
return 0;
@@ -317,7 +484,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
struct kcore_list *m;
read_lock(&kclist_lock);
- for (m=kclist; m; m=m->next) {
+ list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
@@ -326,45 +493,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
if (m == NULL) {
if (clear_user(buffer, tsz))
return -EFAULT;
- } else if (is_vmalloc_addr((void *)start)) {
+ } else if (is_vmalloc_or_module_addr((void *)start)) {
char * elf_buf;
- struct vm_struct *m;
- unsigned long curstart = start;
- unsigned long cursize = tsz;
elf_buf = kzalloc(tsz, GFP_KERNEL);
if (!elf_buf)
return -ENOMEM;
-
- read_lock(&vmlist_lock);
- for (m=vmlist; m && cursize; m=m->next) {
- unsigned long vmstart;
- unsigned long vmsize;
- unsigned long msize = m->size - PAGE_SIZE;
-
- if (((unsigned long)m->addr + msize) <
- curstart)
- continue;
- if ((unsigned long)m->addr > (curstart +
- cursize))
- break;
- vmstart = (curstart < (unsigned long)m->addr ?
- (unsigned long)m->addr : curstart);
- if (((unsigned long)m->addr + msize) >
- (curstart + cursize))
- vmsize = curstart + cursize - vmstart;
- else
- vmsize = (unsigned long)m->addr +
- msize - vmstart;
- curstart = vmstart + vmsize;
- cursize -= vmsize;
- /* don't dump ioremap'd stuff! (TA) */
- if (m->flags & VM_IOREMAP)
- continue;
- memcpy(elf_buf + (vmstart - start),
- (char *)vmstart, vmsize);
- }
- read_unlock(&vmlist_lock);
+ vread(elf_buf, (char *)start, tsz);
+ /* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, elf_buf, tsz)) {
kfree(elf_buf);
return -EFAULT;
@@ -402,12 +538,96 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
return acc;
}
+
+static int open_kcore(struct inode *inode, struct file *filp)
+{
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ if (kcore_need_update)
+ kcore_update_ram();
+ if (i_size_read(inode) != proc_root_kcore->size) {
+ mutex_lock(&inode->i_mutex);
+ i_size_write(inode, proc_root_kcore->size);
+ mutex_unlock(&inode->i_mutex);
+ }
+ return 0;
+}
+
+
+static const struct file_operations proc_kcore_operations = {
+ .read = read_kcore,
+ .open = open_kcore,
+};
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* just remember that we have to update kcore */
+static int __meminit kcore_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ write_lock(&kclist_lock);
+ kcore_need_update = 1;
+ write_unlock(&kclist_lock);
+ }
+ return NOTIFY_OK;
+}
+#endif
+
+
+static struct kcore_list kcore_vmalloc;
+
+#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
+static struct kcore_list kcore_text;
+/*
+ * If defined, special segment is used for mapping kernel text instead of
+ * direct-map area. We need to create special TEXT section.
+ */
+static void __init proc_kcore_text_init(void)
+{
+ kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+}
+#else
+static void __init proc_kcore_text_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+ MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
static int __init proc_kcore_init(void)
{
- proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
- if (proc_root_kcore)
- proc_root_kcore->size =
- (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
+ proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
+ &proc_kcore_operations);
+ if (!proc_root_kcore) {
+ printk(KERN_ERR "couldn't create /proc/kcore\n");
+ return 0; /* Always returns 0. */
+ }
+ /* Store text area if it's special */
+ proc_kcore_text_init();
+ /* Store vmalloc area */
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+ add_modules_range();
+ /* Store direct-map area from physical memory map */
+ kcore_update_ram();
+ hotplug_memory_notifier(kcore_callback, 0);
+
return 0;
}
module_init(proc_kcore_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..a65239cfd97e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
+ "Shmem: %8lu kB\n"
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
+ "KernelStack: %8lu kB\n"
"PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
"Quicklists: %8lu kB\n"
@@ -95,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Committed_AS: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n",
+ "VmallocChunk: %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+ "HardwareCorrupted: %5lu kB\n"
+#endif
+ ,
K(i.totalram),
K(i.freeram),
K(i.bufferram),
@@ -124,10 +130,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
+ K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+ global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
@@ -140,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+ ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7e14d1a04001..9fe7d7ebe115 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
return rb_next((struct rb_node *) v);
}
-static struct seq_operations proc_nommu_region_list_seqop = {
+static const struct seq_operations proc_nommu_region_list_seqop = {
.start = nommu_region_list_start,
.next = nommu_region_list_next,
.stop = nommu_region_list_stop,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..5033ce0d254b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
@@ -93,8 +94,11 @@ static const struct file_operations proc_kpagecount_operations = {
#define KPF_COMPOUND_TAIL 16
#define KPF_HUGE 17
#define KPF_UNEVICTABLE 18
+#define KPF_HWPOISON 19
#define KPF_NOPAGE 20
+#define KPF_KSM 21
+
/* kernel hacking assistances
* WARNING: subject to change, never rely on them!
*/
@@ -137,6 +141,8 @@ static u64 get_uflags(struct page *page)
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
+ if (PageKsm(page))
+ u |= 1 << KPF_KSM;
/*
* compound pages: export both head/tail info
@@ -175,6 +181,10 @@ static u64 get_uflags(struct page *page)
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
+#ifdef CONFIG_MEMORY_FAILURE
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+#endif
+
#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
#endif
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 7ba79a54948c..123257bb356b 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <asm/prom.h>
@@ -25,26 +26,27 @@ static struct proc_dir_entry *proc_device_tree;
/*
* Supply data on a read from /proc/device-tree/node/property.
*/
-static int property_read_proc(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int property_proc_show(struct seq_file *m, void *v)
{
- struct property *pp = data;
- int n;
+ struct property *pp = m->private;
- if (off >= pp->length) {
- *eof = 1;
- return 0;
- }
- n = pp->length - off;
- if (n > count)
- n = count;
- else
- *eof = 1;
- memcpy(page, (char *)pp->value + off, n);
- *start = page;
- return n;
+ seq_write(m, pp->value, pp->length);
+ return 0;
+}
+
+static int property_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, property_proc_show, PDE(inode)->data);
}
+static const struct file_operations property_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = property_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/*
* For a node with a name like "gc@10", we make symlinks called "gc"
* and "@10" to it.
@@ -63,10 +65,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
* Unfortunately proc_register puts each new entry
* at the beginning of the list. So we rearrange them.
*/
- ent = create_proc_read_entry(name,
- strncmp(name, "security-", 9)
- ? S_IRUGO : S_IRUSR, de,
- property_read_proc, pp);
+ ent = proc_create_data(name,
+ strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
+ de, &property_proc_fops, pp);
if (ent == NULL)
return NULL;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
/* careful: calling conventions are nasty here */
res = count;
- error = table->proc_handler(table, write, filp, buf, &res, ppos);
+ error = table->proc_handler(table, write, buf, &res, ppos);
if (!error)
error = res;
out:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..2a1bef9203c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
} else if (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack) {
name = "[stack]";
+ } else {
+ unsigned long stack_start;
+ struct proc_maps_private *pmp;
+
+ pmp = m->private;
+ stack_start = pmp->task->stack_start;
+
+ if (vma->vm_start <= stack_start &&
+ vma->vm_end >= stack_start) {
+ pad_len_spaces(m, len);
+ seq_printf(m,
+ "[threadstack:%08lx]",
+#ifdef CONFIG_STACK_GROWSUP
+ vma->vm_end - stack_start
+#else
+ stack_start - vma->vm_start
+#endif
+ );
+ }
}
} else {
name = "[vdso]";
@@ -465,23 +484,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+#define CLEAR_REFS_ALL 1
+#define CLEAR_REFS_ANON 2
+#define CLEAR_REFS_MAPPED 3
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
+ char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
+ long type;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- if (!simple_strtol(buffer, &end, 0))
+ if (strict_strtol(strstrip(buffer), 10, &type))
+ return -EINVAL;
+ if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
return -EINVAL;
- if (*end == '\n')
- end++;
task = get_proc_task(file->f_path.dentry->d_inode);
if (!task)
return -ESRCH;
@@ -494,18 +518,31 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
clear_refs_walk.private = vma;
- if (!is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ *
+ * Writing 2 to /proc/pid/clear_refs only affects
+ * Anonymous pages.
+ *
+ * Writing 3 to /proc/pid/clear_refs only affects file
+ * mapped pages.
+ */
+ if (type == CLEAR_REFS_ANON && vma->vm_file)
+ continue;
+ if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ continue;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &clear_refs_walk);
}
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
}
put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
+
+ return count;
}
const struct file_operations proc_clear_refs_operations = {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..766b1d456050 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -4,13 +4,18 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/kernel_stat.h>
#include <asm/cputime.h>
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec uptime;
struct timespec idle;
- cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+ int i;
+ cputime_t idletime = cputime_zero;
+
+ for_each_possible_cpu(i)
+ idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
do_posix_clock_monotonic_gettime(&uptime);
monotonic_to_bootbased(&uptime);
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index be8e0e1445b6..5f6089994042 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -6,20 +6,9 @@ config QNX4FS_FS
QNX 4 and QNX 6 (the latter is also called QNX RTP).
Further information is available at <http://www.qnx.com/>.
Say Y if you intend to mount QNX hard disks or floppies.
- Unless you say Y to "QNX4FS read-write support" below, you will
- only be able to read these file systems.
To compile this file system support as a module, choose M here: the
module will be called qnx4.
If you don't know whether you need it, then you don't need it:
answer N.
-
-config QNX4FS_RW
- bool "QNX4FS write support (DANGEROUS)"
- depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
- help
- Say Y if you want to test write support for QNX4 file systems.
-
- It's currently broken, so for now:
- answer N.
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index e4d408cc5473..4a283b3f87f8 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_QNX4FS_FS) += qnx4.o
-qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
+qnx4-objs := inode.o dir.o namei.o bitmap.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index e1cd061a25f7..0afba069d567 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -78,84 +78,3 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
return total_free;
}
-
-#ifdef CONFIG_QNX4FS_RW
-
-int qnx4_is_free(struct super_block *sb, long block)
-{
- int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
- int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
- struct buffer_head *bh;
- const char *g;
- int ret = -EIO;
-
- start += block / (QNX4_BLOCK_SIZE * 8);
- QNX4DEBUG(("qnx4: is_free requesting block [%lu], bitmap in block [%lu]\n",
- (unsigned long) block, (unsigned long) start));
- (void) size; /* CHECKME */
- bh = sb_bread(sb, start);
- if (bh == NULL) {
- return -EIO;
- }
- g = bh->b_data + (block % QNX4_BLOCK_SIZE);
- if (((*g) & (1 << (block % 8))) == 0) {
- QNX4DEBUG(("qnx4: is_free -> block is free\n"));
- ret = 1;
- } else {
- QNX4DEBUG(("qnx4: is_free -> block is busy\n"));
- ret = 0;
- }
- brelse(bh);
-
- return ret;
-}
-
-int qnx4_set_bitmap(struct super_block *sb, long block, int busy)
-{
- int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
- int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
- struct buffer_head *bh;
- char *g;
-
- start += block / (QNX4_BLOCK_SIZE * 8);
- QNX4DEBUG(("qnx4: set_bitmap requesting block [%lu], bitmap in block [%lu]\n",
- (unsigned long) block, (unsigned long) start));
- (void) size; /* CHECKME */
- bh = sb_bread(sb, start);
- if (bh == NULL) {
- return -EIO;
- }
- g = bh->b_data + (block % QNX4_BLOCK_SIZE);
- if (busy == 0) {
- (*g) &= ~(1 << (block % 8));
- } else {
- (*g) |= (1 << (block % 8));
- }
- mark_buffer_dirty(bh);
- brelse(bh);
-
- return 0;
-}
-
-static void qnx4_clear_inode(struct inode *inode)
-{
- struct qnx4_inode_entry *qnx4_ino = qnx4_raw_inode(inode);
- /* What for? */
- memset(qnx4_ino->di_fname, 0, sizeof qnx4_ino->di_fname);
- qnx4_ino->di_size = 0;
- qnx4_ino->di_num_xtnts = 0;
- qnx4_ino->di_mode = 0;
- qnx4_ino->di_status = 0;
-}
-
-void qnx4_free_inode(struct inode *inode)
-{
- if (inode->i_ino < 1) {
- printk("free_inode: inode 0 or nonexistent inode\n");
- return;
- }
- qnx4_clear_inode(inode);
- clear_inode(inode);
-}
-
-#endif
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 003c68f3238b..86cc39cb1398 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -85,9 +85,4 @@ const struct file_operations qnx4_dir_operations =
const struct inode_operations qnx4_dir_inode_operations =
{
.lookup = qnx4_lookup,
-#ifdef CONFIG_QNX4FS_RW
- .create = qnx4_create,
- .unlink = qnx4_unlink,
- .rmdir = qnx4_rmdir,
-#endif
};
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
deleted file mode 100644
index 09b170ac936c..000000000000
--- a/fs/qnx4/file.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * QNX4 file system, Linux implementation.
- *
- * Version : 0.2.1
- *
- * Using parts of the xiafs filesystem.
- *
- * History :
- *
- * 25-05-1998 by Richard Frowijn : first release.
- * 21-06-1998 by Frank Denis : wrote qnx4_readpage to use generic_file_read.
- * 27-06-1998 by Frank Denis : file overwriting.
- */
-
-#include "qnx4.h"
-
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the qnx4 filesystem.
- */
-const struct file_operations qnx4_file_operations =
-{
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .mmap = generic_file_mmap,
- .splice_read = generic_file_splice_read,
-#ifdef CONFIG_QNX4FS_RW
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
- .fsync = simple_fsync,
-#endif
-};
-
-const struct inode_operations qnx4_file_inode_operations =
-{
-#ifdef CONFIG_QNX4FS_RW
- .truncate = qnx4_truncate,
-#endif
-};
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 681df5fcd161..d2cd1798d8c4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -28,73 +28,6 @@
static const struct super_operations qnx4_sops;
-#ifdef CONFIG_QNX4FS_RW
-
-static void qnx4_delete_inode(struct inode *inode)
-{
- QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
- truncate_inode_pages(&inode->i_data, 0);
- inode->i_size = 0;
- qnx4_truncate(inode);
- lock_kernel();
- qnx4_free_inode(inode);
- unlock_kernel();
-}
-
-static int qnx4_write_inode(struct inode *inode, int do_sync)
-{
- struct qnx4_inode_entry *raw_inode;
- int block, ino;
- struct buffer_head *bh;
- ino = inode->i_ino;
-
- QNX4DEBUG(("qnx4: write inode 1.\n"));
- if (inode->i_nlink == 0) {
- return 0;
- }
- if (!ino) {
- printk("qnx4: bad inode number on dev %s: %d is out of range\n",
- inode->i_sb->s_id, ino);
- return -EIO;
- }
- QNX4DEBUG(("qnx4: write inode 2.\n"));
- block = ino / QNX4_INODES_PER_BLOCK;
- lock_kernel();
- if (!(bh = sb_bread(inode->i_sb, block))) {
- printk("qnx4: major problem: unable to read inode from dev "
- "%s\n", inode->i_sb->s_id);
- unlock_kernel();
- return -EIO;
- }
- raw_inode = ((struct qnx4_inode_entry *) bh->b_data) +
- (ino % QNX4_INODES_PER_BLOCK);
- raw_inode->di_mode = cpu_to_le16(inode->i_mode);
- raw_inode->di_uid = cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->di_gid = cpu_to_le16(fs_high2lowgid(inode->i_gid));
- raw_inode->di_nlink = cpu_to_le16(inode->i_nlink);
- raw_inode->di_size = cpu_to_le32(inode->i_size);
- raw_inode->di_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- raw_inode->di_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
- mark_buffer_dirty(bh);
- if (do_sync) {
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk("qnx4: IO error syncing inode [%s:%08x]\n",
- inode->i_sb->s_id, ino);
- brelse(bh);
- unlock_kernel();
- return -EIO;
- }
- }
- brelse(bh);
- unlock_kernel();
- return 0;
-}
-
-#endif
-
static void qnx4_put_super(struct super_block *sb);
static struct inode *qnx4_alloc_inode(struct super_block *sb);
static void qnx4_destroy_inode(struct inode *inode);
@@ -108,10 +41,6 @@ static const struct super_operations qnx4_sops =
.put_super = qnx4_put_super,
.statfs = qnx4_statfs,
.remount_fs = qnx4_remount,
-#ifdef CONFIG_QNX4FS_RW
- .write_inode = qnx4_write_inode,
- .delete_inode = qnx4_delete_inode,
-#endif
};
static int qnx4_remount(struct super_block *sb, int *flags, char *data)
@@ -120,15 +49,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
qs = qnx4_sb(sb);
qs->Version = QNX4_VERSION;
-#ifndef CONFIG_QNX4FS_RW
*flags |= MS_RDONLY;
-#endif
- if (*flags & MS_RDONLY) {
- return 0;
- }
-
- mark_buffer_dirty(qs->sb_buf);
-
return 0;
}
@@ -354,9 +275,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
}
s->s_op = &qnx4_sops;
s->s_magic = QNX4_SUPER_MAGIC;
-#ifndef CONFIG_QNX4FS_RW
s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
-#endif
qnx4_sb(s)->sb_buf = bh;
qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
@@ -489,8 +408,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
if (S_ISREG(inode->i_mode)) {
- inode->i_op = &qnx4_file_inode_operations;
- inode->i_fop = &qnx4_file_operations;
+ inode->i_fop = &generic_ro_fops;
inode->i_mapping->a_ops = &qnx4_aops;
qnx4_i(inode)->mmu_private = inode->i_size;
} else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 5972ed214937..ae1e7edbacd6 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -134,108 +134,3 @@ out:
return NULL;
}
-
-#ifdef CONFIG_QNX4FS_RW
-int qnx4_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
-{
- QNX4DEBUG(("qnx4: qnx4_create\n"));
- if (dir == NULL) {
- return -ENOENT;
- }
- return -ENOSPC;
-}
-
-int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct buffer_head *bh;
- struct qnx4_inode_entry *de;
- struct inode *inode;
- int retval;
- int ino;
-
- QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name));
- lock_kernel();
- bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
- &de, &ino);
- if (bh == NULL) {
- unlock_kernel();
- return -ENOENT;
- }
- inode = dentry->d_inode;
- if (inode->i_ino != ino) {
- retval = -EIO;
- goto end_rmdir;
- }
-#if 0
- if (!empty_dir(inode)) {
- retval = -ENOTEMPTY;
- goto end_rmdir;
- }
-#endif
- if (inode->i_nlink != 2) {
- QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink));
- }
- QNX4DEBUG(("qnx4: deleting directory\n"));
- de->di_status = 0;
- memset(de->di_fname, 0, sizeof de->di_fname);
- de->di_mode = 0;
- mark_buffer_dirty_inode(bh, dir);
- clear_nlink(inode);
- mark_inode_dirty(inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
- inode_dec_link_count(dir);
- retval = 0;
-
- end_rmdir:
- brelse(bh);
-
- unlock_kernel();
- return retval;
-}
-
-int qnx4_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct buffer_head *bh;
- struct qnx4_inode_entry *de;
- struct inode *inode;
- int retval;
- int ino;
-
- QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name));
- lock_kernel();
- bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
- &de, &ino);
- if (bh == NULL) {
- unlock_kernel();
- return -ENOENT;
- }
- inode = dentry->d_inode;
- if (inode->i_ino != ino) {
- retval = -EIO;
- goto end_unlink;
- }
- retval = -EPERM;
- if (!inode->i_nlink) {
- QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n",
- inode->i_sb->s_id,
- inode->i_ino, inode->i_nlink));
- inode->i_nlink = 1;
- }
- de->di_status = 0;
- memset(de->di_fname, 0, sizeof de->di_fname);
- de->di_mode = 0;
- mark_buffer_dirty_inode(bh, dir);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
- mark_inode_dirty(dir);
- inode->i_ctime = dir->i_ctime;
- inode_dec_link_count(inode);
- retval = 0;
-
-end_unlink:
- unlock_kernel();
- brelse(bh);
-
- return retval;
-}
-#endif
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 9efc089454f6..33a60858203b 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -29,17 +29,9 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
extern struct buffer_head *qnx4_bread(struct inode *, int, int);
-extern const struct inode_operations qnx4_file_inode_operations;
extern const struct inode_operations qnx4_dir_inode_operations;
-extern const struct file_operations qnx4_file_operations;
extern const struct file_operations qnx4_dir_operations;
extern int qnx4_is_free(struct super_block *sb, long block);
-extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
-extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
-extern void qnx4_truncate(struct inode *inode);
-extern void qnx4_free_inode(struct inode *inode);
-extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
-extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
{
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
deleted file mode 100644
index d94d9ee241fe..000000000000
--- a/fs/qnx4/truncate.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * QNX4 file system, Linux implementation.
- *
- * Version : 0.1
- *
- * Using parts of the xiafs filesystem.
- *
- * History :
- *
- * 30-06-1998 by Frank DENIS : ugly filler.
- */
-
-#include <linux/smp_lock.h>
-#include "qnx4.h"
-
-#ifdef CONFIG_QNX4FS_RW
-
-void qnx4_truncate(struct inode *inode)
-{
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode))) {
- return;
- }
- lock_kernel();
- if (!(S_ISDIR(inode->i_mode))) {
- /* TODO */
- }
- QNX4DEBUG(("qnx4: qnx4_truncate called\n"));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- unlock_kernel();
-}
-
-#endif
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..353e78a9ebee 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
config QUOTA_NETLINK_INTERFACE
bool "Report quota messages through netlink interface"
- depends on QUOTA && NET
+ depends on QUOTACTL && NET
help
If you say Y here, quota warnings (about exceeding softlimit, reaching
hardlimit, etc.) will be reported through netlink interface. If unsure,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..f97c1d9b2c48 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
#include <asm/uaccess.h>
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
}
#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "VFS_DQUOT",
- .version = 1,
- .maxattr = QUOTA_NL_A_MAX,
-};
-
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
- static atomic_t seq;
- struct sk_buff *skb;
- void *msg_head;
- int ret;
- int msg_size = 4 * nla_total_size(sizeof(u32)) +
- 2 * nla_total_size(sizeof(u64));
-
- /* We have to allocate using GFP_NOFS as we are called from a
- * filesystem performing write and thus further recursion into
- * the fs to free some data could cause deadlocks. */
- skb = genlmsg_new(msg_size, GFP_NOFS);
- if (!skb) {
- printk(KERN_ERR
- "VFS: Not enough memory to send quota warning.\n");
- return;
- }
- msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
- &quota_genl_family, 0, QUOTA_NL_C_WARNING);
- if (!msg_head) {
- printk(KERN_ERR
- "VFS: Cannot store netlink header in quota warning.\n");
- goto err_out;
- }
- ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
- MAJOR(dquot->dq_sb->s_dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
- MINOR(dquot->dq_sb->s_dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
- if (ret)
- goto attr_err_out;
- genlmsg_end(skb, msg_head);
-
- genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
- return;
-attr_err_out:
- printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
- kfree_skb(skb);
-}
-#endif
/*
* Write warnings to the console and send warning messages over netlink.
*
@@ -1145,18 +1074,20 @@ err_out:
*/
static void flush_warnings(struct dquot *const *dquots, char *warntype)
{
+ struct dquot *dq;
int i;
- for (i = 0; i < MAXQUOTAS; i++)
- if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
- !warning_issued(dquots[i], warntype[i])) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = dquots[i];
+ if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+ !warning_issued(dq, warntype[i])) {
#ifdef CONFIG_PRINT_QUOTA_WARNING
- print_warning(dquots[i], warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
- send_warning(dquots[i], warntype[i]);
+ print_warning(dq, warntype[i]);
#endif
+ quota_send_warning(dq->dq_type, dq->dq_id,
+ dq->dq_sb->s_dev, warntype[i]);
}
+ }
}
static int ignore_hardlimit(struct dquot *dquot)
@@ -1839,7 +1770,7 @@ EXPORT_SYMBOL(dquot_commit_info);
/*
* Definitions of diskquota operations.
*/
-struct dquot_operations dquot_operations = {
+const struct dquot_operations dquot_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -2233,7 +2164,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
+ mutex_lock(&sb->s_root->d_inode->i_mutex);
dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
+ mutex_unlock(&sb->s_root->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -2461,7 +2394,7 @@ out:
}
EXPORT_SYMBOL(vfs_set_dqinfo);
-struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops vfs_quotactl_ops = {
.quota_on = vfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -2607,12 +2540,6 @@ static int __init dquot_init(void)
register_shrinker(&dqcache_shrinker);
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
- if (genl_register_family(&quota_genl_family) != 0)
- printk(KERN_ERR
- "VFS: Failed to create quota netlink interface.\n");
-#endif
-
return 0;
}
module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..ee91e2756950 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
/* Check validity of generic quotactl commands */
static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
return ret;
}
#endif
+
+
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "VFS_DQUOT",
+ .version = 1,
+ .maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+ const char warntype)
+{
+ static atomic_t seq;
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+ int msg_size = 4 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u64));
+
+ /* We have to allocate using GFP_NOFS as we are called from a
+ * filesystem performing write and thus further recursion into
+ * the fs to free some data could cause deadlocks. */
+ skb = genlmsg_new(msg_size, GFP_NOFS);
+ if (!skb) {
+ printk(KERN_ERR
+ "VFS: Not enough memory to send quota warning.\n");
+ return;
+ }
+ msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+ &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+ if (!msg_head) {
+ printk(KERN_ERR
+ "VFS: Cannot store netlink header in quota warning.\n");
+ goto err_out;
+ }
+ ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+ if (ret)
+ goto attr_err_out;
+ genlmsg_end(skb, msg_head);
+
+ genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+ return;
+attr_err_out:
+ printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+ if (genl_register_family(&quota_genl_family) != 0)
+ printk(KERN_ERR
+ "VFS: Failed to create quota netlink interface.\n");
+ return 0;
+};
+
+module_init(quota_init);
+#endif
+
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 0edcf42b1778..2ae757e9c008 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -204,7 +204,7 @@ out:
return ret;
}
-static struct quota_format_ops v1_format_ops = {
+static const struct quota_format_ops v1_format_ops = {
.check_quota_file = v1_check_quota_file,
.read_file_info = v1_read_file_info,
.write_file_info = v1_write_file_info,
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index a5475fb1ae44..01f25eae684d 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -207,7 +207,7 @@ static int v2_free_file_info(struct super_block *sb, int type)
return 0;
}
-static struct quota_format_ops v2_format_ops = {
+static const struct quota_format_ops v2_format_ops = {
.check_quota_file = v2_check_quota_file,
.read_file_info = v2_read_file_info,
.write_file_info = v2_write_file_info,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 11f0c06316de..32fae4040ebf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
if (unlikely(order >= MAX_ORDER))
- goto too_big;
+ return -EFBIG;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && newsize > limit)
- goto fsize_exceeded;
-
- if (newsize > inode->i_sb->s_maxbytes)
- goto too_big;
+ ret = inode_newsize_ok(inode, newsize);
+ if (ret)
+ return ret;
i_size_write(inode, newsize);
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
return 0;
- fsize_exceeded:
- send_sig(SIGXFSZ, current, 0);
- too_big:
- return -EFBIG;
-
- add_error:
+add_error:
while (loop < npages)
__free_page(pages + loop++);
return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a6090aa1a7c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,18 +34,17 @@
#include <linux/ramfs.h>
#include <linux/sched.h>
#include <linux/parser.h>
+#include <linux/magic.h>
#include <asm/uaccess.h>
#include "internal.h"
-/* some random number */
-#define RAMFS_MAGIC 0x858458f6
-
#define RAMFS_DEFAULT_MODE 0755
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
static struct backing_dev_info ramfs_backing_dev_info = {
+ .name = "ramfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/read_write.c b/fs/read_write.c
index 6c8c55dec2bc..3ac28987f22a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
pos = *ppos;
- retval = -EINVAL;
- if (unlikely(pos < 0))
- goto fput_out;
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
if (pos >= max)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 58727b5b4351..339b0baf2af6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -615,7 +615,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
-static struct dquot_operations reiserfs_quota_operations = {
+static const struct dquot_operations reiserfs_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -632,7 +632,7 @@ static struct dquot_operations reiserfs_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops reiserfs_qctl_operations = {
+static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208adf8e71..71e2b4d50a0a 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd)
- return romfs_mtd_strnlen(sb, pos, limit);
+ return romfs_mtd_strnlen(sb, pos, maxlen);
#endif
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev)
- return romfs_blk_strnlen(sb, pos, limit);
+ return romfs_blk_strnlen(sb, pos, maxlen);
#endif
return -EIO;
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
.readdir = romfs_readdir,
};
-static struct inode_operations romfs_dir_inode_operations = {
+static const struct inode_operations romfs_dir_inode_operations = {
.lookup = romfs_lookup,
};
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
root = romfs_iget(sb, pos);
- if (!root)
+ if (IS_ERR(root))
goto error;
sb->s_root = d_alloc_root(root);
diff --git a/fs/select.c b/fs/select.c
index 8084834e123e..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -41,22 +42,28 @@
* better solutions..
*/
+#define MAX_SLACK (100 * NSEC_PER_MSEC)
+
static long __estimate_accuracy(struct timespec *tv)
{
long slack;
int divfactor = 1000;
+ if (tv->tv_sec < 0)
+ return 0;
+
if (task_nice(current) > 0)
divfactor = divfactor / 5;
+ if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
+ return MAX_SLACK;
+
slack = tv->tv_nsec / divfactor;
slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
- if (slack > 100 * NSEC_PER_MSEC)
- slack = 100 * NSEC_PER_MSEC;
+ if (slack > MAX_SLACK)
+ return MAX_SLACK;
- if (slack < 0)
- slack = 0;
return slack;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6c959275f2d0..eae7d9dbf3ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
*/
int seq_path(struct seq_file *m, struct path *path, char *esc)
{
- if (m->count < m->size) {
- char *s = m->buf + m->count;
- char *p = d_path(path, s, m->size - m->count);
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -1;
+
+ if (size) {
+ char *p = d_path(path, buf, size);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return s - p;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
}
}
- m->count = m->size;
- return -1;
+ seq_commit(m, res);
+
+ return res;
}
EXPORT_SYMBOL(seq_path);
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *esc)
{
- int err = -ENAMETOOLONG;
- if (m->count < m->size) {
- char *s = m->buf + m->count;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -ENAMETOOLONG;
+
+ if (size) {
char *p;
spin_lock(&dcache_lock);
- p = __d_path(path, root, s, m->size - m->count);
+ p = __d_path(path, root, buf, size);
spin_unlock(&dcache_lock);
- err = PTR_ERR(p);
+ res = PTR_ERR(p);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return 0;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
+ else
+ res = -ENAMETOOLONG;
}
}
- m->count = m->size;
- return err;
+ seq_commit(m, res);
+
+ return res < 0 ? res : 0;
}
/*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
*/
int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
{
- if (m->count < m->size) {
- char *s = m->buf + m->count;
- char *p = dentry_path(dentry, s, m->size - m->count);
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -1;
+
+ if (size) {
+ char *p = dentry_path(dentry, buf, size);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return s - p;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
}
}
- m->count = m->size;
- return -1;
+ seq_commit(m, res);
+
+ return res;
}
int seq_bitmap(struct seq_file *m, const unsigned long *bits,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1402d2d54f52..1c4c8f089970 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
static void
smb_unload_nls(struct smb_sb_info *server)
{
- if (server->remote_nls) {
- unload_nls(server->remote_nls);
- server->remote_nls = NULL;
- }
- if (server->local_nls) {
- unload_nls(server->local_nls);
- server->local_nls = NULL;
- }
+ unload_nls(server->remote_nls);
+ unload_nls(server->local_nls);
}
static void
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 9468168b9af5..71c29b6670b4 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -509,7 +509,7 @@ date_unix2dos(struct smb_sb_info *server,
month = 2;
} else {
nl_day = (year & 3) || day <= 59 ? day : day - 1;
- for (month = 0; month < 12; month++)
+ for (month = 1; month < 12; month++)
if (day_n[month] > nl_day)
break;
}
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
len = left;
ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
- if (ret > 0)
+ if (ret > 0) {
*ppos += ret;
+ file_accessed(in);
+ }
return ret;
}
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ret = file_remove_suid(out);
- if (!ret)
+ if (!ret) {
+ file_update_time(out);
ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+ }
mutex_unlock(&inode->i_mutex);
} while (ret > 0);
splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (ret > 0) {
unsigned long nr_pages;
+ int err;
- *ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- mutex_lock(&inode->i_mutex);
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- mutex_unlock(&inode->i_mutex);
-
- if (err)
- ret = err;
- }
+ err = generic_write_sync(out, *ppos, ret);
+ if (err)
+ ret = err;
+ else
+ *ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
#include "squashfs.h"
static struct file_system_type squashfs_fs_type;
-static struct super_operations squashfs_super_ops;
+static const struct super_operations squashfs_super_ops;
static int supported_squashfs_filesystem(short major, short minor, short comp)
{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV
};
-static struct super_operations squashfs_super_ops = {
+static const struct super_operations squashfs_super_ops = {
.alloc_inode = squashfs_alloc_inode,
.destroy_inode = squashfs_destroy_inode,
.statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..19eb70b374bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
static struct super_block *alloc_super(struct file_system_type *type)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
- static struct super_operations default_op;
+ static const struct super_operations default_op;
if (s) {
if (security_sb_alloc(s)) {
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
s = NULL;
goto out;
}
- INIT_LIST_HEAD(&s->s_dirty);
- INIT_LIST_HEAD(&s->s_io);
- INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -468,6 +465,48 @@ rescan:
}
EXPORT_SYMBOL(get_super);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given. Returns the superblock with an active
+ * reference and s_umount held exclusively or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ if (!bdev)
+ return NULL;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdev != bdev)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_write(&sb->s_umount);
+ if (sb->s_root) {
+ spin_lock(&sb_lock);
+ if (sb->s_count > S_BIAS) {
+ atomic_inc(&sb->s_active);
+ sb->s_count--;
+ spin_unlock(&sb_lock);
+ return sb;
+ }
+ spin_unlock(&sb_lock);
+ }
+ up_write(&sb->s_umount);
+ put_super(sb);
+ yield();
+ spin_lock(&sb_lock);
+ }
+ spin_unlock(&sb_lock);
+ return NULL;
+}
struct super_block * user_get_super(dev_t dev)
{
@@ -530,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
int retval;
int remount_rw;
-
+
+ if (sb->s_frozen != SB_UNFROZEN)
+ return -EBUSY;
+
#ifdef CONFIG_BLOCK
if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
+
if (flags & MS_RDONLY)
acct_auto_close(sb);
shrink_dcache_sb(sb);
@@ -710,6 +753,12 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
@@ -740,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
* will protect the lockfs code from trying to start a snapshot
* while we are mounting
*/
- down(&bdev->bd_mount_sem);
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = -EBUSY;
+ goto error_bdev;
+ }
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
- up(&bdev->bd_mount_sem);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (IS_ERR(s))
goto error_s;
@@ -889,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
if (error)
goto out_sb;
+ /*
+ * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+ * but s_maxbytes was an unsigned long long for many releases. Throw
+ * this warning for a little while to try and catch filesystems that
+ * violate this rule. This warning should be either removed or
+ * converted to a BUG() in 2.6.34.
+ */
+ WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+ "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..36752a683481 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,29 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
*/
static int __sync_filesystem(struct super_block *sb, int wait)
{
+ /*
+ * This should be safe, as we require bdi backing to actually
+ * write out data in the first place
+ */
+ if (!sb->s_bdi)
+ return 0;
+
/* Avoid doing twice syncing and cache pruning for quota sync */
- if (!wait)
+ if (!wait) {
writeout_quota_sb(sb, -1);
- else
+ writeback_inodes_sb(sb);
+ } else {
sync_quota_sb(sb, -1);
- sync_inodes_sb(sb, wait);
+ sync_inodes_sb(sb);
+ }
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
return __sync_blockdev(sb->s_bdev, wait);
@@ -99,7 +108,7 @@ restart:
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+ if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
__sync_filesystem(sb, wait);
up_read(&sb->s_umount);
@@ -118,7 +127,7 @@ restart:
*/
SYSCALL_DEFINE0(sync)
{
- wakeup_pdflush(0);
+ wakeup_flusher_threads(0);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
@@ -174,21 +183,26 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
ret = err;
return ret;
}
+EXPORT_SYMBOL(file_fsync);
/**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
* @dentry: dentry of @file
- * @data: only perform a fdatasync operation
+ * @start: offset in bytes of the beginning of data range to sync
+ * @end: offset in bytes of the end of data range (inclusive)
+ * @datasync: perform only datasync
*
- * Write back data and metadata for @file to disk. If @datasync is
- * set only metadata needed to access modified file data is written.
+ * Write back data in range @start..@end and metadata for @file to disk. If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
+ loff_t end, int datasync)
{
const struct file_operations *fop;
struct address_space *mapping;
@@ -212,7 +226,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
- ret = filemap_fdatawrite(mapping);
+ ret = filemap_write_and_wait_range(mapping, start, end);
/*
* We need to protect against concurrent writers, which could cause
@@ -223,12 +237,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
- err = filemap_fdatawait(mapping);
- if (!ret)
- ret = err;
+
out:
return ret;
}
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file: file to sync
+ * @dentry: dentry of @file
+ * @datasync: only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk. If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set. This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+}
EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +285,24 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file: file to which the write happened
+ * @pos: offset where the write started
+ * @count: length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+ if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
+ return 0;
+ return vfs_fsync_range(file, file->f_path.dentry, pos,
+ pos + count - 1,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
@@ -404,9 +453,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
- ret = wait_on_page_writeback_range(mapping,
- offset >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ ret = filemap_fdatawait_range(mapping, offset, endbyte);
if (ret < 0)
goto out;
}
@@ -419,9 +466,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
}
if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {
- ret = wait_on_page_writeback_range(mapping,
- offset >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ ret = filemap_fdatawait_range(mapping, offset, endbyte);
}
out:
return ret;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714bece1..60c702bc10ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ struct bin_buffer {
struct mutex mutex;
void *buffer;
int mmapped;
- struct vm_operations_struct *vm_ops;
+ const struct vm_operations_struct *vm_ops;
struct file *file;
struct hlist_node list;
};
@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
}
#endif
-static struct vm_operations_struct bin_vm_ops = {
+static const struct vm_operations_struct bin_vm_ops = {
.open = bin_vma_open,
.close = bin_vma_close,
.fault = bin_fault,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..e0201837d244 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -21,6 +21,7 @@
#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "sysfs.h"
DEFINE_MUTEX(sysfs_mutex);
@@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
sysfs_put(sd->s_symlink.target_sd);
if (sysfs_type(sd) & SYSFS_COPY_NAME)
kfree(sd->s_name);
+ if (sd->s_iattr && sd->s_iattr->ia_secdata)
+ security_release_secctx(sd->s_iattr->ia_secdata,
+ sd->s_iattr->ia_secdata_len);
kfree(sd->s_iattr);
sysfs_free_ino(sd->s_ino);
kmem_cache_free(sysfs_dir_cachep, sd);
@@ -760,6 +764,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
const struct inode_operations sysfs_dir_inode_operations = {
.lookup = sysfs_lookup,
.setattr = sysfs_setattr,
+ .setxattr = sysfs_setxattr,
};
static void remove_dir(struct sysfs_dirent *sd)
@@ -893,7 +898,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
mutex_lock(&sysfs_rename_mutex);
BUG_ON(!sd->s_parent);
- new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
+ new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+ new_parent_kobj->sd : &sysfs_root;
error = 0;
if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c050cef..f5ea4680f15f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
struct sysfs_open_dirent *od, *new_od = NULL;
retry:
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irq(&sysfs_open_dirent_lock);
if (!sd->s_attr.open && new_od) {
sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
list_add_tail(&buffer->list, &od->buffers);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irq(&sysfs_open_dirent_lock);
if (od) {
kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
struct sysfs_buffer *buffer)
{
struct sysfs_open_dirent *od = sd->s_attr.open;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
list_del(&buffer->list);
if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
else
od = NULL;
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
kfree(od);
}
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
void sysfs_notify_dirent(struct sysfs_dirent *sd)
{
struct sysfs_open_dirent *od;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
od = sd->s_attr.open;
if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
wake_up_interruptible(&od->poll);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
}
EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
#include "sysfs.h"
extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
};
static struct backing_dev_info sysfs_backing_dev_info = {
+ .name = "sysfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static const struct inode_operations sysfs_inode_operations ={
.setattr = sysfs_setattr,
+ .setxattr = sysfs_setxattr,
};
int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
return bdi_init(&sysfs_backing_dev_info);
}
+struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+{
+ struct sysfs_inode_attrs *attrs;
+ struct iattr *iattrs;
+
+ attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+ iattrs = &attrs->ia_iattr;
+
+ /* assign default attributes */
+ iattrs->ia_mode = sd->s_mode;
+ iattrs->ia_uid = 0;
+ iattrs->ia_gid = 0;
+ iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+ return attrs;
+}
int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
{
struct inode * inode = dentry->d_inode;
struct sysfs_dirent * sd = dentry->d_fsdata;
- struct iattr * sd_iattr;
+ struct sysfs_inode_attrs *sd_attrs;
+ struct iattr *iattrs;
unsigned int ia_valid = iattr->ia_valid;
int error;
if (!sd)
return -EINVAL;
- sd_iattr = sd->s_iattr;
+ sd_attrs = sd->s_iattr;
error = inode_change_ok(inode, iattr);
if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
if (error)
return error;
- if (!sd_iattr) {
+ if (!sd_attrs) {
/* setting attributes for the first time, allocate now */
- sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
- if (!sd_iattr)
+ sd_attrs = sysfs_init_inode_attrs(sd);
+ if (!sd_attrs)
return -ENOMEM;
- /* assign default attributes */
- sd_iattr->ia_mode = sd->s_mode;
- sd_iattr->ia_uid = 0;
- sd_iattr->ia_gid = 0;
- sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
- sd->s_iattr = sd_iattr;
+ sd->s_iattr = sd_attrs;
+ } else {
+ /* attributes were changed at least once in past */
+ iattrs = &sd_attrs->ia_iattr;
+
+ if (ia_valid & ATTR_UID)
+ iattrs->ia_uid = iattr->ia_uid;
+ if (ia_valid & ATTR_GID)
+ iattrs->ia_gid = iattr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+ iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MTIME)
+ iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_CTIME)
+ iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+ iattrs->ia_mode = sd->s_mode = mode;
+ }
}
+ return error;
+}
- /* attributes were changed atleast once in past */
-
- if (ia_valid & ATTR_UID)
- sd_iattr->ia_uid = iattr->ia_uid;
- if (ia_valid & ATTR_GID)
- sd_iattr->ia_gid = iattr->ia_gid;
- if (ia_valid & ATTR_ATIME)
- sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MTIME)
- sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_CTIME)
- sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
- sd_iattr->ia_mode = sd->s_mode = mode;
- }
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct sysfs_dirent *sd = dentry->d_fsdata;
+ struct sysfs_inode_attrs *iattrs;
+ void *secdata;
+ int error;
+ u32 secdata_len = 0;
+
+ if (!sd)
+ return -EINVAL;
+ if (!sd->s_iattr)
+ sd->s_iattr = sysfs_init_inode_attrs(sd);
+ if (!sd->s_iattr)
+ return -ENOMEM;
+
+ iattrs = sd->s_iattr;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+ const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+ error = security_inode_setsecurity(dentry->d_inode, suffix,
+ value, size, flags);
+ if (error)
+ goto out;
+ error = security_inode_getsecctx(dentry->d_inode,
+ &secdata, &secdata_len);
+ if (error)
+ goto out;
+ if (iattrs->ia_secdata)
+ security_release_secctx(iattrs->ia_secdata,
+ iattrs->ia_secdata_len);
+ iattrs->ia_secdata = secdata;
+ iattrs->ia_secdata_len = secdata_len;
+ } else
+ return -EINVAL;
+out:
return error;
}
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct bin_attribute *bin_attr;
+ struct sysfs_inode_attrs *iattrs;
inode->i_private = sysfs_get(sd);
inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
inode->i_ino = sd->s_ino;
lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
- if (sd->s_iattr) {
+ iattrs = sd->s_iattr;
+ if (iattrs) {
/* sysfs_dirent has non-default attributes
* get them for the new inode from persistent copy
* in sysfs_dirent
*/
- set_inode_attr(inode, sd->s_iattr);
+ set_inode_attr(inode, &iattrs->ia_iattr);
+ if (iattrs->ia_secdata)
+ security_inode_notifysecctx(inode,
+ iattrs->ia_secdata,
+ iattrs->ia_secdata_len);
} else
set_default_inode_attr(inode, sd->s_mode);
-
/* initialize inode according to type */
switch (sysfs_type(sd)) {
case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/mutex.h>
+#include <linux/security.h>
#include "sysfs.h"
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
}
const struct inode_operations sysfs_symlink_inode_operations = {
+ .setxattr = sysfs_setxattr,
.readlink = generic_readlink,
.follow_link = sysfs_follow_link,
.put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#include <linux/fs.h>
+
struct sysfs_open_dirent;
/* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
struct hlist_head buffers;
};
+struct sysfs_inode_attrs {
+ struct iattr ia_iattr;
+ void *ia_secdata;
+ u32 ia_secdata_len;
+};
+
/*
* sysfs_dirent - the building block of sysfs hierarchy. Each and
* every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
unsigned int s_flags;
ino_t s_ino;
umode_t s_mode;
- struct iattr *s_iattr;
+ struct sysfs_inode_attrs *s_iattr;
};
#define SD_DEACTIVATED_BIAS INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
void sysfs_delete_inode(struct inode *inode);
int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags);
int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
int sysfs_inode_init(void);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,41 +54,15 @@
* @nr_to_write: how many dirty pages to write-back
*
* This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
*
* Note, this function synchronizes even VFS inodes which are locked
* (@i_mutex) by the caller of the budgeting function, because write-back does
* not touch @i_mutex.
*/
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
- int nr_written;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .range_end = LLONG_MAX,
- .nr_to_write = nr_to_write,
- };
-
- generic_sync_sb_inodes(c->vfs_sb, &wbc);
- nr_written = nr_to_write - wbc.nr_to_write;
-
- if (!nr_written) {
- /*
- * Re-try again but wait on pages/inodes which are being
- * written-back concurrently (e.g., by pdflush).
- */
- memset(&wbc, 0, sizeof(struct writeback_control));
- wbc.sync_mode = WB_SYNC_ALL;
- wbc.range_end = LLONG_MAX;
- wbc.nr_to_write = nr_to_write;
- generic_sync_sb_inodes(c->vfs_sb, &wbc);
- nr_written = nr_to_write - wbc.nr_to_write;
- }
-
- dbg_budg("%d pages were written back", nr_written);
- return nr_written;
+ writeback_inodes_sb(c->vfs_sb);
}
/**
@@ -741,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
* ubifs_get_free_space - return amount of free space.
* @c: UBIFS file-system description object
*
- * This function calculates and retuns amount of free space to report to
+ * This function calculates and returns amount of free space to report to
* user-space.
*/
long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
int first = 1, iip;
struct ubifs_debug_info *d = c->dbg;
- union ubifs_key lower_key, upper_key, l_key, u_key;
+ union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
unsigned long long uninitialized_var(last_sqnum);
struct ubifs_idx_node *idx;
struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
}
}
+const char *dbg_jhead(int jhead)
+{
+ switch (jhead) {
+ case GCHD:
+ return "0 (GC)";
+ case BASEHD:
+ return "1 (base)";
+ case DATAHD:
+ return "2 (data)";
+ default:
+ return "unknown journal head";
+ }
+}
+
static void dump_ch(const struct ubifs_ch *ch)
{
printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
- printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
- c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+ printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+ dbg_jhead(c->jheads[i].wbuf.jhead),
+ c->jheads[i].wbuf.lnum);
for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
bud = rb_entry(rb, struct ubifs_bud, rb);
printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
{
- printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
- "flags %#x\n", lp->lnum, lp->free, lp->dirty,
- c->leb_size - lp->free - lp->dirty, lp->flags);
+ int i, spc, dark = 0, dead = 0;
+ struct rb_node *rb;
+ struct ubifs_bud *bud;
+
+ spc = lp->free + lp->dirty;
+ if (spc < c->dead_wm)
+ dead = spc;
+ else
+ dark = ubifs_calc_dark(c, spc);
+
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
+ lp->dirty, c->leb_size - spc, spc, lp->flags);
+ else
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
+ "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
+ c->leb_size - spc, spc, dark, dead,
+ (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+
+ if (lp->flags & LPROPS_TAKEN) {
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_CONT "index, taken");
+ else
+ printk(KERN_CONT "taken");
+ } else {
+ const char *s;
+
+ if (lp->flags & LPROPS_INDEX) {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_DIRTY_IDX:
+ s = "dirty index";
+ break;
+ case LPROPS_FRDI_IDX:
+ s = "freeable index";
+ break;
+ default:
+ s = "index";
+ }
+ } else {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_UNCAT:
+ s = "not categorized";
+ break;
+ case LPROPS_DIRTY:
+ s = "dirty";
+ break;
+ case LPROPS_FREE:
+ s = "free";
+ break;
+ case LPROPS_EMPTY:
+ s = "empty";
+ break;
+ case LPROPS_FREEABLE:
+ s = "freeable";
+ break;
+ default:
+ s = NULL;
+ break;
+ }
+ }
+ printk(KERN_CONT "%s", s);
+ }
+
+ for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
+ bud = rb_entry(rb, struct ubifs_bud, rb);
+ if (bud->lnum == lp->lnum) {
+ int head = 0;
+ for (i = 0; i < c->jhead_cnt; i++) {
+ if (lp->lnum == c->jheads[i].wbuf.lnum) {
+ printk(KERN_CONT ", jhead %s",
+ dbg_jhead(i));
+ head = 1;
+ }
+ }
+ if (!head)
+ printk(KERN_CONT ", bud of jhead %s",
+ dbg_jhead(bud->jhead));
+ }
+ }
+ if (lp->lnum == c->gc_lnum)
+ printk(KERN_CONT ", GC LEB");
+ printk(KERN_CONT ")\n");
}
void dbg_dump_lprops(struct ubifs_info *c)
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
ubifs_err("scan error %d", (int)PTR_ERR(sleb));
return;
@@ -909,8 +1005,10 @@ out:
ubifs_msg("saved lprops statistics dump");
dbg_dump_lstats(&d->saved_lst);
ubifs_get_lp_stats(c, &lst);
+
ubifs_msg("current lprops statistics dump");
- dbg_dump_lstats(&d->saved_lst);
+ dbg_dump_lstats(&lst);
+
spin_lock(&c->space_lock);
dbg_dump_budg(c);
spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
/* Dump functions */
const char *dbg_ntype(int type);
const char *dbg_cstate(int cmt_state);
+const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
int dbg_check_lprops(struct ubifs_info *c);
int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
int row, int col);
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size);
/* Force the use of in-the-gaps method for testing */
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_ntype(type) ""
#define dbg_cstate(cmt_state) ""
+#define dbg_jhead(jhead) ""
#define dbg_get_key_dump(c, key) ({})
#define dbg_dump_inode(c, inode) ({})
#define dbg_dump_node(c, node) ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_check_heap(c, heap, cat, add_pos) ({})
#define dbg_check_lprops(c) 0
#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_check_inode_size(c, inode, size) 0
#define dbg_force_in_the_gaps_enabled 0
#define dbg_force_in_the_gaps() 0
#define dbg_failure_mode 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..39849f887e72 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
*/
/*
- * This file implements VFS file and inode operations of regular files, device
+ * This file implements VFS file and inode operations for regular files, device
* nodes and symlinks as well as address space operations.
*
- * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
- * page is dirty and is used for budgeting purposes - dirty pages should not be
- * budgeted. The PG_checked flag is set if full budgeting is required for the
- * page e.g., when it corresponds to a file hole or it is just beyond the file
- * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
- * fail in this function, and the budget is released in 'ubifs_write_end()'. So
- * the PG_private and PG_checked flags carry the information about how the page
- * was budgeted, to make it possible to release the budget properly.
+ * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
+ * the page is dirty and is used for optimization purposes - dirty pages are
+ * not budgeted so the flag shows that 'ubifs_write_end()' should not release
+ * the budget for this page. The @PG_checked flag is set if full budgeting is
+ * required for the page e.g., when it corresponds to a file hole or it is
+ * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
+ * it is OK to fail in this function, and the budget is released in
+ * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
+ * information about how the page was budgeted, to make it possible to release
+ * the budget properly.
*
- * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
- * we implement. However, this is not true for '->writepage()', which might be
- * called with 'i_mutex' unlocked. For example, when pdflush is performing
- * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
- * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
- * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
- * path'. So, in '->writepage()' we are only guaranteed that the page is
- * locked.
+ * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
+ * implement. However, this is not true for 'ubifs_writepage()', which may be
+ * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * we are only guaranteed that the page is locked.
*
- * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
- * readahead path does not have it locked ("sys_read -> generic_file_aio_read
- * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
- * not set as well. However, UBIFS disables readahead.
- *
- * This, for example means that there might be 2 concurrent '->writepage()'
- * calls for the same inode, but different inode dirty pages.
+ * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
+ * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
+ * set as well. However, UBIFS disables readahead.
*/
#include "ubifs.h"
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
/*
* We change whole page so no need to load it. But we
* have to set the @PG_checked flag to make the further
- * code the page is new. This might be not true, but it
- * is better to budget more that to read the page from
- * the media.
+ * code know that the page is new. This might be not
+ * true, but it is better to budget more than to read
+ * the page from the media.
*/
SetPageChecked(page);
skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * Whee, we aquired budgeting quickly - without involving
- * garbage-collection, committing or forceing write-back. We return
+ * Whee, we acquired budgeting quickly - without involving
+ * garbage-collection, committing or forcing write-back. We return
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
/*
* Return 0 to force VFS to repeat the whole operation, or the
- * error code if 'do_readpage()' failes.
+ * error code if 'do_readpage()' fails.
*/
copied = do_readpage(page);
goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
- /* The other attributes may be changed at the same time as well */
+ /* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
-
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
mutex_unlock(&ui->ui_mutex);
+
out_budg:
if (budgeted)
ubifs_release_budget(c, &req);
@@ -1391,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
int err;
- ssize_t ret;
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1399,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
return err;
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- if (ret < 0)
- return ret;
-
- if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
- err = ubifs_sync_wbufs_by_inode(c, inode);
- if (err)
- return err;
- }
-
- return ret;
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static int ubifs_set_page_dirty(struct page *page)
@@ -1536,7 +1523,7 @@ out_unlock:
return err;
}
-static struct vm_operations_struct ubifs_file_vm_ops = {
+static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ubifs_vm_page_mkwrite,
};
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
* We scan the entire LEB even though we only really need to scan up to
* (c->leb_size - lp->free).
*/
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
{
struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
- dbg_io("jhead %d", wbuf->jhead);
+ dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
wbuf->need_sync = 1;
wbuf->c->need_wbuf_sync = 1;
ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
if (wbuf->no_timer)
return;
- dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+ dbg_io("set timer for jhead %s, %llu-%llu millisecs",
+ dbg_jhead(wbuf->jhead),
div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
/* Write-buffer is empty or not seeked */
return 0;
- dbg_io("LEB %d:%d, %d bytes, jhead %d",
- wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
+ dbg_io("LEB %d:%d, %d bytes, jhead %s",
+ wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
ubifs_assert(!(wbuf->avail & 7));
ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
{
const struct ubifs_info *c = wbuf->c;
- dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
+ dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
ubifs_assert(offs >= 0 && offs <= c->leb_size);
ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
struct ubifs_info *c = wbuf->c;
int err, written, n, aligned_len = ALIGN(len, 8), offs;
- dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
- dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
- wbuf->lnum, wbuf->offs + wbuf->used);
+ dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
+ dbg_ntype(((struct ubifs_ch *)buf)->node_type),
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
memcpy(wbuf->buf + wbuf->used, buf, len);
if (aligned_len == wbuf->avail) {
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
wbuf->offs, c->min_io_size,
wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
* minimal I/O unit. We have to fill and flush write-buffer and switch
* to the next min. I/O unit.
*/
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int err, rlen, overlap;
struct ubifs_ch *ch = buf;
- dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
- dbg_ntype(type), len, wbuf->jhead);
+ dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
+ dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
ubifs_assert(!(offs & 7) && offs < c->leb_size);
ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
* some. But the write-buffer mutex has to be unlocked because
* GC also takes it.
*/
- dbg_jnl("no free space jhead %d, run GC", jhead);
+ dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
mutex_unlock(&wbuf->io_mutex);
lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
* because we dropped @wbuf->io_mutex, so try once
* again.
*/
- dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+ dbg_jnl("GC couldn't make a free LEB for jhead %s",
+ dbg_jhead(jhead));
if (retries++ < 2) {
dbg_jnl("retry (%d)", retries);
goto again;
@@ -184,7 +185,7 @@ again:
}
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
- dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+ dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
avail = c->leb_size - wbuf->offs - wbuf->used;
if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
ubifs_prepare_node(c, node, len, 0);
return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
err = ubifs_wbuf_write_nolock(wbuf, buf, len);
if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
}
/**
- * xent_key_init_hash - initialize extended attribute entry key without
- * re-calculating hash function.
- * @c: UBIFS file-system description object
- * @key: key to initialize
- * @inum: host inode number
- * @hash: extended attribute entry name hash
- */
-static inline void xent_key_init_hash(const struct ubifs_info *c,
- union ubifs_key *key, ino_t inum,
- uint32_t hash)
-{
- ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
- key->u32[0] = inum;
- key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
-}
-
-/**
* xent_key_init_flash - initialize on-flash extended attribute entry key.
* @c: UBIFS file-system description object
* @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
}
/**
- * data_key_init_flash - initialize on-flash data key.
+ * highest_data_key - get the highest possible data key for an inode.
* @c: UBIFS file-system description object
- * @k: key to initialize
+ * @key: key to initialize
* @inum: inode number
- * @block: block number
*/
-static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, unsigned int block)
+static inline void highest_data_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
{
- union ubifs_key *key = k;
-
- ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
- key->j32[0] = cpu_to_le32(inum);
- key->j32[1] = cpu_to_le32(block |
- (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
- memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+ data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
}
/**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
return 0;
}
}
+
#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
*/
c->bud_bytes += c->leb_size - bud->start;
- dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
- bud->start, bud->jhead, c->bud_bytes);
+ dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
+ bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
spin_unlock(&c->buds_lock);
}
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
* heads (non-closed buds).
*/
c->cmt_bud_bytes += wbuf->offs - bud->start;
- dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, wbuf->offs - bud->start,
+ dbg_jhead(bud->jhead), wbuf->offs - bud->start,
c->cmt_bud_bytes);
bud->start = wbuf->offs;
} else {
c->cmt_bud_bytes += c->leb_size - bud->start;
- dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, c->leb_size - bud->start,
+ dbg_jhead(bud->jhead), c->leb_size - bud->start,
c->cmt_bud_bytes);
rb_erase(p1, &c->buds);
/*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
if (lnum == -1 || offs == c->leb_size)
continue;
- dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+ dbg_log("add ref to LEB %d:%d for jhead %s",
+ lnum, offs, dbg_jhead(i));
ref = buf + len;
ref->ch.node_type = UBIFS_REF_NODE;
ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
lnum = c->ltail_lnum;
write_lnum = lnum;
while (1) {
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
goto out_free;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
case LPROPS_FREE:
if (add_to_lpt_heap(c, lprops, cat))
break;
- /* No more room on heap so make it uncategorized */
+ /* No more room on heap so make it un-categorized */
cat = LPROPS_UNCAT;
/* Fall through */
case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
* @lprops: LEB properties
*
* A LEB may have fallen off of the bottom of a heap, and ended up as
- * uncategorized even though it has enough space for us now. If that is the case
- * this function will put the LEB back onto a heap.
+ * un-categorized even though it has enough space for us now. If that is the
+ * case this function will put the LEB back onto a heap.
*/
void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
/**
* change_category - change LEB properties category.
* @c: UBIFS file-system description object
- * @lprops: LEB properties to recategorize
+ * @lprops: LEB properties to re-categorize
*
* LEB properties are categorized to enable fast find operations. When the LEB
- * properties change they must be recategorized.
+ * properties change they must be re-categorized.
*/
static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
}
/**
- * calc_dark - calculate LEB dark space size.
+ * ubifs_calc_dark - calculate LEB dark space size.
* @c: the UBIFS file-system description object
* @spc: amount of free and dirty space in the LEB
*
- * This function calculates amount of dark space in an LEB which has @spc bytes
- * of free and dirty space. Returns the calculations result.
+ * This function calculates and returns amount of dark space in an LEB which
+ * has @spc bytes of free and dirty space.
*
- * Dark space is the space which is not always usable - it depends on which
- * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
- * it is dark space, because it cannot fit a large data node. So UBIFS cannot
- * count on this LEB and treat these 512 bytes as usable because it is not true
- * if, for example, only big chunks of uncompressible data will be written to
- * the FS.
+ * UBIFS is trying to account the space which might not be usable, and this
+ * space is called "dark space". For example, if an LEB has only %512 free
+ * bytes, it is dark space, because it cannot fit a large data node.
*/
-static int calc_dark(struct ubifs_info *c, int spc)
+int ubifs_calc_dark(const struct ubifs_info *c, int spc)
{
ubifs_assert(!(spc & 7));
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
* @free: new free space amount
* @dirty: new dirty space amount
* @flags: new flags
- * @idx_gc_cnt: change to the count of idx_gc list
+ * @idx_gc_cnt: change to the count of @idx_gc list
*
* This function changes LEB properties (@free, @dirty or @flag). However, the
* property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
{
/*
* This is the only function that is allowed to change lprops, so we
- * discard the const qualifier.
+ * discard the "const" qualifier.
*/
struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (old_spc < c->dead_wm)
c->lst.total_dead -= old_spc;
else
- c->lst.total_dark -= calc_dark(c, old_spc);
+ c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
c->lst.total_used -= c->leb_size - old_spc;
}
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (new_spc < c->dead_wm)
c->lst.total_dead += new_spc;
else
- c->lst.total_dark += calc_dark(c, new_spc);
+ c->lst.total_dark += ubifs_calc_dark(c, new_spc);
c->lst.total_used += c->leb_size - new_spc;
}
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
/*
* After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->empty_lebs += 1;
lst->total_free += c->leb_size;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->total_free += lp->free;
lst->total_dirty += lp->dirty;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (spc < c->dead_wm)
lst->total_dead += spc;
else
- lst->total_dark += calc_dark(c, spc);
+ lst->total_dark += ubifs_calc_dark(c, spc);
}
ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
* @c: UBIFS file-system description object
*
* This function scans the master node LEBs and search for the latest master
- * node. Returns zero in case of success and a negative error code in case of
+ * node. Returns zero in case of success, %-EUCLEAN if there master area is
+ * corrupted and requires recovery, and a negative error code in case of
* failure.
*/
static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum = UBIFS_MST_LNUM;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
memcpy(c->mst_node, snod->node, snod->len);
offs = snod->offs;
}
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum += 1;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
goto out;
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
if (snod->offs != offs)
goto out;
if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
out:
ubifs_scan_destroy(sleb);
+ return -EUCLEAN;
+
+out_dump:
+ ubifs_err("unexpected node type %d master LEB %d:%d",
+ snod->type, lnum, snod->offs);
+ ubifs_scan_destroy(sleb);
return -EINVAL;
}
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
err = scan_for_master(c);
if (err) {
- err = ubifs_recover_master_node(c);
+ if (err == -EUCLEAN)
+ err = ubifs_recover_master_node(c);
if (err)
/*
* Note, we do not free 'c->mst_node' here because the
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
struct ubifs_scan_leb *sleb;
dbg_rcvry("LEB %d", lnum);
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb)) {
- sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ if (PTR_ERR(sleb) == -EUCLEAN)
+ sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
struct ubifs_scan_leb *sleb;
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
mst = mst2;
}
- dbg_rcvry("recovered master node from LEB %d",
+ ubifs_msg("recovered master node from LEB %d",
(mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
* We can only recover at the end of the log, so check that the
* next log LEB is empty or out of date.
*/
- sleb = ubifs_scan(c, next_lnum, 0, sbuf);
+ sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
if (IS_ERR(sleb))
return sleb;
if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
if (c->need_recovery)
sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
else
- sleb = ubifs_scan(c, lnum, offs, c->sbuf);
+ sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
const struct ubifs_cs_node *node;
dbg_mnt("replay log LEB %d:%d", lnum, offs);
- sleb = ubifs_scan(c, lnum, offs, sbuf);
- if (IS_ERR(sleb) ) {
+ sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
+ if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
return PTR_ERR(sleb);
sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
/* Make the node pads to 8-byte boundary */
if ((node_len + pad_len) & 7) {
- if (!quiet) {
+ if (!quiet)
dbg_err("bad padding length %d - %d",
offs, offs + node_len + pad_len);
- }
return SCANNED_A_BAD_PAD_NODE;
}
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
* @c: UBIFS file-system description object
* @lnum: logical eraseblock number
* @offs: offset to start at (usually zero)
- * @sbuf: scan buffer (must be c->leb_size)
+ * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
+ * @quiet: print no messages
*
* This function scans LEB number @lnum and returns complete information about
* its contents. Returns the scaned information in case of success and,
* %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
* of failure.
+ *
+ * If @quiet is non-zero, this function does not print large and scary
+ * error messages and flash dumps in case of errors.
*/
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf)
+ int offs, void *sbuf, int quiet)
{
void *buf = sbuf + offs;
int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
cond_resched();
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
if (ret > 0) {
/* Padding bytes or a valid padding node */
offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
}
if (offs % c->min_io_size) {
- ubifs_err("empty space starts at non-aligned offset %d", offs);
+ if (!quiet)
+ ubifs_err("empty space starts at non-aligned offset %d",
+ offs);
goto corrupted;;
}
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
break;
for (; len; offs++, buf++, len--)
if (*(uint8_t *)buf != 0xff) {
- ubifs_err("corrupt empty space at LEB %d:%d",
- lnum, offs);
+ if (!quiet)
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
goto corrupted;
}
return sleb;
corrupted:
- ubifs_scanned_corruption(c, lnum, offs, buf);
+ if (!quiet) {
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ ubifs_err("LEB %d scanning failed", lnum);
+ }
err = -EUCLEAN;
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+
error:
- ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_err("LEB %d scanning failed, error %d", lnum, err);
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..943ad5624530 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
-#include <linux/smp_lock.h>
#include "ubifs.h"
/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
if (err)
ubifs_err("can't write inode %lu, error %d",
inode->i_ino, err);
+ else
+ err = dbg_check_inode_size(c, inode, ui->ui_size);
}
ui->dirty = 0;
@@ -438,12 +439,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
{
int i, err;
struct ubifs_info *c = sb->s_fs_info;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .nr_to_write = LONG_MAX,
- };
/*
* Zero @wait is just an advisory thing to help the file system shove
@@ -454,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return 0;
/*
- * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
- * pages, so synchronize them first, then commit the journal. Strictly
- * speaking, it is not necessary to commit the journal here,
- * synchronizing write-buffers would be enough. But committing makes
- * UBIFS free space predictions much more accurate, so we want to let
- * the user be able to get more accurate results of 'statfs()' after
- * they synchronize the file system.
- */
- generic_sync_sb_inodes(sb, &wbc);
-
- /*
* Synchronize write buffers, because 'ubifs_run_commit()' does not
* do this if it waits for an already running commit.
*/
@@ -474,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return err;
}
+ /*
+ * Strictly speaking, it is not necessary to commit the journal here,
+ * synchronizing write-buffers would be enough. But committing makes
+ * UBIFS free space predictions much more accurate, so we want to let
+ * the user be able to get more accurate results of 'statfs()' after
+ * they synchronize the file system.
+ */
err = ubifs_run_commit(c);
if (err)
return err;
@@ -1726,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
c->vi.vol_id);
- lock_kernel();
-
/*
* The following asserts are only valid if there has not been a failure
* of the media. For example, there will be dirty inodes if we failed
@@ -1792,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
kfree(c);
-
- unlock_kernel();
}
static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1809,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
return err;
}
- lock_kernel();
if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
err = ubifs_remount_rw(c);
- if (err) {
- unlock_kernel();
+ if (err)
return err;
- }
} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
ubifs_remount_ro(c);
@@ -1839,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
}
ubifs_assert(c->lst.taken_empty_lebs > 0);
- unlock_kernel();
return 0;
}
@@ -1861,22 +1842,32 @@ const struct super_operations ubifs_super_operations = {
* @name: UBI volume name
* @mode: UBI volume open mode
*
- * There are several ways to specify UBI volumes when mounting UBIFS:
- * o ubiX_Y - UBI device number X, volume Y;
- * o ubiY - UBI device number 0, volume Y;
+ * The primary method of mounting UBIFS is by specifying the UBI volume
+ * character device node path. However, UBIFS may also be mounted withoug any
+ * character device node using one of the following methods:
+ *
+ * o ubiX_Y - mount UBI device number X, volume Y;
+ * o ubiY - mount UBI device number 0, volume Y;
* o ubiX:NAME - mount UBI device X, volume with name NAME;
* o ubi:NAME - mount UBI device 0, volume with name NAME.
*
* Alternative '!' separator may be used instead of ':' (because some shells
* like busybox may interpret ':' as an NFS host name separator). This function
- * returns ubi volume object in case of success and a negative error code in
- * case of failure.
+ * returns UBI volume description object in case of success and a negative
+ * error code in case of failure.
*/
static struct ubi_volume_desc *open_ubi(const char *name, int mode)
{
+ struct ubi_volume_desc *ubi;
int dev, vol;
char *endptr;
+ /* First, try to open using the device node path method */
+ ubi = ubi_open_volume_path(name, mode);
+ if (!IS_ERR(ubi))
+ return ubi;
+
+ /* Try the "nodev" method */
if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
return ERR_PTR(-EINVAL);
@@ -1971,6 +1962,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
*
* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
*/
+ c->bdi.name = "ubifs",
c->bdi.capabilities = BDI_CAP_MAP_COPY;
c->bdi.unplug_io_fn = default_unplug_io_fn;
err = bdi_init(&c->bdi);
@@ -1985,6 +1977,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_bdi;
+ sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
* o exact match, i.e. the found zero-level znode contains key @key, then %1
* is returned and slot number of the matched branch is stored in @n;
* o not exact match, which means that zero-level znode does not contain
- * @key, then %0 is returned and slot number of the closed branch is stored
- * in @n;
+ * @key, then %0 is returned and slot number of the closest branch is stored
+ * in @n;
* o @key is so small that it is even less than the lowest key of the
* leftmost zero-level node, then %0 is returned and %0 is stored in @n.
*
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
* @lnum: LEB number is returned here
* @offs: offset is returned here
*
- * This function look up and reads node with key @key. The caller has to make
+ * This function looks up and reads node with key @key. The caller has to make
* sure the @node buffer is large enough to fit the node. Returns zero in case
* of success, %-ENOENT if the node was not found, and a negative error code in
* case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
mutex_unlock(&c->tnc_mutex);
return err;
}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_inode_size - check if inode size is correct.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @size: inode size
+ *
+ * This function makes sure that the inode size (@size) is correct and it does
+ * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
+ * if it has a data page beyond @size, and other negative error code in case of
+ * other errors.
+ */
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size)
+{
+ int err, n;
+ union ubifs_key from_key, to_key, *key;
+ struct ubifs_znode *znode;
+ unsigned int block;
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+ data_key_init(c, &from_key, inode->i_ino, block);
+ highest_data_key(c, &to_key, inode->i_ino);
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, &from_key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err) {
+ err = -EINVAL;
+ key = &from_key;
+ goto out_dump;
+ }
+
+ err = tnc_next(c, &znode, &n);
+ if (err == -ENOENT) {
+ err = 0;
+ goto out_unlock;
+ }
+ if (err < 0)
+ goto out_unlock;
+
+ ubifs_assert(err == 0);
+ key = &znode->zbranch[n].key;
+ if (!key_in_range(c, key, &from_key, &to_key))
+ goto out_unlock;
+
+out_dump:
+ block = key_block(c, key);
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
+ "(data key %s)", (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ dbg_dump_inode(c, inode);
+ dbg_dump_stack();
+ err = -EINVAL;
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
* it is more comprehensive and less efficient than is needed for this
* purpose.
*/
- sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+ sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
c->ileb_len = 0;
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
/* The key is always at the same position in all keyed nodes */
#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+/* Garbage collector journal head number */
+#define UBIFS_GC_HEAD 0
+/* Base journal head number */
+#define UBIFS_BASE_HEAD 1
+/* Data journal head number */
+#define UBIFS_DATA_HEAD 2
+
/*
* LEB Properties Tree node types.
*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
/* Number of non-data journal heads */
#define NONDATA_JHEADS_CNT 2
-/* Garbage collector head */
-#define GCHD 0
-/* Base journal head number */
-#define BASEHD 1
-/* First "general purpose" journal head */
-#define DATAHD 2
+/* Shorter names for journal head numbers for internal usage */
+#define GCHD UBIFS_GC_HEAD
+#define BASEHD UBIFS_BASE_HEAD
+#define DATAHD UBIFS_DATA_HEAD
/* 'No change' value for 'ubifs_change_lp()' */
#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
/* scan.c */
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf);
+ int offs, void *sbuf, int quiet);
void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
SECURITY_XATTR,
};
-static struct inode_operations none_inode_operations;
-static struct address_space_operations none_address_operations;
-static struct file_operations none_file_operations;
+static const struct inode_operations none_inode_operations;
+static const struct address_space_operations none_address_operations;
+static const struct file_operations none_file_operations;
/**
* create_xattr - create an extended attribute.
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
-#if 0
-static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
- uint8_t ad_size, struct kernel_lb_addr fe_loc,
- int *pos, int *offset, struct buffer_head **bh,
- int *error)
-{
- int loffset = *offset;
- int block;
- uint8_t *ad;
- int remainder;
-
- *error = 0;
-
- ad = (uint8_t *)(*bh)->b_data + *offset;
- *offset += ad_size;
-
- if (!ad) {
- brelse(*bh);
- *error = 1;
- return NULL;
- }
-
- if (*offset == dir->i_sb->s_blocksize) {
- brelse(*bh);
- block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
- if (!block)
- return NULL;
- *bh = udf_tread(dir->i_sb, block);
- if (!*bh)
- return NULL;
- } else if (*offset > dir->i_sb->s_blocksize) {
- ad = tmpad;
-
- remainder = dir->i_sb->s_blocksize - loffset;
- memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
-
- brelse(*bh);
- block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
- if (!block)
- return NULL;
- (*bh) = udf_tread(dir->i_sb, block);
- if (!*bh)
- return NULL;
-
- memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
- ad_size - remainder);
- *offset = ad_size - remainder;
- }
-
- return ad;
-}
-#endif
-
struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
return fi;
}
-#if 0
-static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
-{
- struct extent_ad *ext;
- struct fileEntry *fe;
- uint8_t *ptr;
-
- if ((!buffer) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
- return NULL;
- }
-
- fe = (struct fileEntry *)buffer;
-
- if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
- udf_debug("0x%x != TAG_IDENT_FE\n",
- le16_to_cpu(fe->descTag.tagIdent));
- return NULL;
- }
-
- ptr = (uint8_t *)(fe->extendedAttr) +
- le32_to_cpu(fe->lengthExtendedAttr);
-
- if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
- ptr += *offset;
-
- ext = (struct extent_ad *)ptr;
-
- *offset = *offset + sizeof(struct extent_ad);
- return ext;
-}
-#endif
-
struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
int inc)
{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE) {
+ mutex_lock(&inode->i_mutex);
lock_kernel();
udf_discard_prealloc(inode);
unlock_kernel();
+ mutex_unlock(&inode->i_mutex);
}
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
}
/*
- * If we are going to release inode from memory, we discard preallocation and
- * truncate last inode extent to proper length. We could use drop_inode() but
- * it's called under inode_lock and thus we cannot mark inode dirty there. We
- * use clear_inode() but we have to make sure to write inode as it's not written
- * automatically.
+ * If we are going to release inode from memory, we truncate last inode extent
+ * to proper length. We could use drop_inode() but it's called under inode_lock
+ * and thus we cannot mark inode dirty there. We use clear_inode() but we have
+ * to make sure to write inode as it's not written automatically.
*/
void udf_clear_inode(struct inode *inode)
{
struct udf_inode_info *iinfo;
if (!(inode->i_sb->s_flags & MS_RDONLY)) {
lock_kernel();
- /* Discard preallocation for directories, symlinks, etc. */
- udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
unlock_kernel();
write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
#ifdef UDF_PREALLOCATE
- /* preallocate blocks */
- udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+ /* We preallocate blocks only for regular files. It also makes sense
+ * for directories but there's a problem when to drop the
+ * preallocation. We might use some delayed work for that but I feel
+ * it's overengineering for a filesystem like UDF. */
+ if (S_ISREG(inode->i_mode))
+ udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
#endif
/* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
ms_info.addr_format = CDROM_LBA;
i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
-#define WE_OBEY_THE_WRITTEN_STANDARDS 1
-
if (i == 0) {
udf_debug("XA disk: %s, vol_desc_start=%d\n",
(ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
-#if WE_OBEY_THE_WRITTEN_STANDARDS
if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
-#endif
vol_desc_start = ms_info.addr.lba;
} else {
udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
pc->componentType = 1;
pc->lengthComponentIdent = 0;
pc->componentFileVersionNum = 0;
- pc += sizeof(struct pathComponent);
elen += sizeof(struct pathComponent);
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return inode_permission(inode, mask);
}
-int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
+/**
+ * __vfs_setxattr_noperm - perform setxattr operation without performing
+ * permission checks.
+ *
+ * @dentry - object to perform setxattr on
+ * @name - xattr name to set
+ * @value - value to set @name to
+ * @size - size of @value
+ * @flags - flags to pass into filesystem operations
+ *
+ * returns the result of the internal setxattr or setsecurity operations.
+ *
+ * This function requires the caller to lock the inode's i_mutex before it
+ * is executed. It also assumes that the caller will make the appropriate
+ * permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- int error;
-
- error = xattr_permission(inode, name, MAY_WRITE);
- if (error)
- return error;
+ int error = -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
- error = security_inode_setxattr(dentry, name, value, size, flags);
- if (error)
- goto out;
- error = -EOPNOTSUPP;
if (inode->i_op->setxattr) {
error = inode->i_op->setxattr(dentry, name, value, size, flags);
if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!error)
fsnotify_xattr(dentry);
}
+
+ return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = xattr_permission(inode, name, MAY_WRITE);
+ if (error)
+ return error;
+
+ mutex_lock(&inode->i_mutex);
+ error = security_inode_setxattr(dentry, name, value, size, flags);
+ if (error)
+ goto out;
+
+ error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
out:
mutex_unlock(&inode->i_mutex);
return error;
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee9..05ac0fe9c4d3 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
acl_e = acl->a_entries;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..616ee3febcbb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
}
/*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+ xfs_ioend_t *ioend)
+{
+ xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ xfs_fsize_t isize;
+ xfs_fsize_t bsize;
+
+ bsize = ioend->io_offset + ioend->io_size;
+ isize = MAX(ip->i_size, ip->i_new_size);
+ isize = MIN(isize, bsize);
+ return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
* Update on-disk file size now that data has been written to disk.
* The current in-memory file size is i_size. If a write is beyond
* eof i_new_size will be the intended file size until i_size is
* updated. If this write does not extend all the way to the valid
* file size then restrict this update to the end of the write.
*/
+
STATIC void
xfs_setfilesize(
xfs_ioend_t *ioend)
{
xfs_inode_t *ip = XFS_I(ioend->io_inode);
xfs_fsize_t isize;
- xfs_fsize_t bsize;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,17 +224,10 @@ xfs_setfilesize(
if (unlikely(ioend->io_error))
return;
- bsize = ioend->io_offset + ioend->io_size;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
-
- if (ip->i_d.di_size < isize) {
+ isize = xfs_ioend_new_eof(ioend);
+ if (isize) {
ip->i_d.di_size = isize;
- ip->i_update_core = 1;
- ip->i_update_size = 1;
xfs_mark_inode_dirty_sync(ip);
}
@@ -224,71 +235,36 @@ xfs_setfilesize(
}
/*
- * Buffered IO write completion for delayed allocate extents.
- */
-STATIC void
-xfs_end_bio_delalloc(
- struct work_struct *work)
-{
- xfs_ioend_t *ioend =
- container_of(work, xfs_ioend_t, io_work);
-
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * Buffered IO write completion for regular, written extents.
+ * IO write completion.
*/
STATIC void
-xfs_end_bio_written(
- struct work_struct *work)
-{
- xfs_ioend_t *ioend =
- container_of(work, xfs_ioend_t, io_work);
-
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * IO write completion for unwritten extents.
- *
- * Issue transactions to convert a buffer range from unwritten
- * to written extents.
- */
-STATIC void
-xfs_end_bio_unwritten(
+xfs_end_io(
struct work_struct *work)
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- xfs_off_t offset = ioend->io_offset;
- size_t size = ioend->io_size;
-
- if (likely(!ioend->io_error)) {
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- int error;
- error = xfs_iomap_write_unwritten(ip, offset, size);
- if (error)
- ioend->io_error = error;
- }
- xfs_setfilesize(ioend);
- }
- xfs_destroy_ioend(ioend);
-}
-/*
- * IO read completion for regular, written extents.
- */
-STATIC void
-xfs_end_bio_read(
- struct work_struct *work)
-{
- xfs_ioend_t *ioend =
- container_of(work, xfs_ioend_t, io_work);
+ /*
+ * For unwritten extents we need to issue transactions to convert a
+ * range to normal written extens after the data I/O has finished.
+ */
+ if (ioend->io_type == IOMAP_UNWRITTEN &&
+ likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
+ int error;
+
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ if (error)
+ ioend->io_error = error;
+ }
+ /*
+ * We might have to update the on-disk file size after extending
+ * writes.
+ */
+ if (ioend->io_type != IOMAP_READ)
+ xfs_setfilesize(ioend);
xfs_destroy_ioend(ioend);
}
@@ -303,10 +279,10 @@ xfs_finish_ioend(
int wait)
{
if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq = xfsdatad_workqueue;
- if (ioend->io_work.func == xfs_end_bio_unwritten)
- wq = xfsconvertd_workqueue;
+ struct workqueue_struct *wq;
+ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ xfsconvertd_workqueue : xfsdatad_workqueue;
queue_work(wq, &ioend->io_work);
if (wait)
flush_workqueue(wq);
@@ -344,15 +320,7 @@ xfs_alloc_ioend(
ioend->io_offset = 0;
ioend->io_size = 0;
- if (type == IOMAP_UNWRITTEN)
- INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
- else if (type == IOMAP_DELAY)
- INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
- else if (type == IOMAP_READ)
- INIT_WORK(&ioend->io_work, xfs_end_bio_read);
- else
- INIT_WORK(&ioend->io_work, xfs_end_bio_written);
-
+ INIT_WORK(&ioend->io_work, xfs_end_io);
return ioend;
}
@@ -401,15 +369,23 @@ xfs_end_bio(
STATIC void
xfs_submit_ioend_bio(
- xfs_ioend_t *ioend,
- struct bio *bio)
+ struct writeback_control *wbc,
+ xfs_ioend_t *ioend,
+ struct bio *bio)
{
atomic_inc(&ioend->io_remaining);
-
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
- submit_bio(WRITE, bio);
+ /*
+ * If the I/O is beyond EOF we mark the inode dirty immediately
+ * but don't update the inode size until I/O completion.
+ */
+ if (xfs_ioend_new_eof(ioend))
+ xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+
+ submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC_PLUG : WRITE, bio);
ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
bio_put(bio);
}
@@ -488,6 +464,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
*/
STATIC void
xfs_submit_ioend(
+ struct writeback_control *wbc,
xfs_ioend_t *ioend)
{
xfs_ioend_t *head = ioend;
@@ -516,19 +493,19 @@ xfs_submit_ioend(
retry:
bio = xfs_alloc_ioend_bio(bh);
} else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(ioend, bio);
+ xfs_submit_ioend_bio(wbc, ioend, bio);
goto retry;
}
if (bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(ioend, bio);
+ xfs_submit_ioend_bio(wbc, ioend, bio);
goto retry;
}
lastblock = bh->b_blocknr;
}
if (bio)
- xfs_submit_ioend_bio(ioend, bio);
+ xfs_submit_ioend_bio(wbc, ioend, bio);
xfs_finish_ioend(ioend, 0);
} while ((ioend = next) != NULL);
}
@@ -1181,7 +1158,7 @@ xfs_page_state_convert(
}
if (iohead)
- xfs_submit_ioend(iohead);
+ xfs_submit_ioend(wbc, iohead);
return page_dirty;
@@ -1518,7 +1495,7 @@ xfs_end_io_direct(
* didn't map an unwritten extent so switch it's completion
* handler.
*/
- INIT_WORK(&ioend->io_work, xfs_end_bio_written);
+ ioend->io_type = IOMAP_NEW;
xfs_finish_ioend(ioend, 0);
}
@@ -1636,4 +1613,5 @@ const struct address_space_operations xfs_address_space_operations = {
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..eff61e2732af 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -42,7 +42,7 @@
#include <linux/dcache.h>
-static struct vm_operations_struct xfs_file_vm_ops;
+static const struct vm_operations_struct xfs_file_vm_ops;
STATIC ssize_t
xfs_file_aio_read(
@@ -172,12 +172,14 @@ xfs_file_release(
*/
STATIC int
xfs_file_fsync(
- struct file *filp,
- struct dentry *dentry,
- int datasync)
+ struct file *file,
+ struct dentry *dentry,
+ int datasync)
{
- xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
- return -xfs_fsync(XFS_I(dentry->d_inode));
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return -xfs_fsync(ip);
}
STATIC int
@@ -271,7 +273,7 @@ const struct file_operations xfs_dir_file_operations = {
.fsync = xfs_file_fsync,
};
-static struct vm_operations_struct xfs_file_vm_ops = {
+static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = xfs_vm_page_mkwrite,
};
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..cd42ef78f6b5 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
#include "xfs_error.h"
#include "xfs_itable.h"
#include "xfs_rw.h"
-#include "xfs_acl.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -58,19 +57,22 @@
#include <linux/fiemap.h>
/*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
*/
void
-xfs_synchronize_atime(
+xfs_synchronize_times(
xfs_inode_t *ip)
{
struct inode *inode = VFS_I(ip);
- if (!(inode->i_state & I_CLEAR)) {
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- }
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
}
/*
@@ -107,32 +109,20 @@ xfs_ichgtime(
if ((flags & XFS_ICHGTIME_MOD) &&
!timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
if ((flags & XFS_ICHGTIME_CHG) &&
!timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
/*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
+ * Update complete - now make sure everyone knows that the inode
+ * is dirty.
*/
- if (sync_it) {
- SYNCHRONIZE();
- ip->i_update_core = 1;
+ if (sync_it)
xfs_mark_inode_dirty_sync(ip);
- }
}
/*
@@ -485,14 +475,6 @@ xfs_vn_put_link(
}
STATIC int
-xfs_vn_permission(
- struct inode *inode,
- int mask)
-{
- return generic_permission(inode, mask, xfs_check_acl);
-}
-
-STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
struct dentry *dentry,
@@ -515,10 +497,8 @@ xfs_vn_getattr(
stat->gid = ip->i_d.di_gid;
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
@@ -696,7 +676,7 @@ xfs_vn_fiemap(
}
static const struct inode_operations xfs_inode_operations = {
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.truncate = xfs_vn_truncate,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -724,7 +704,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -749,7 +729,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -762,7 +742,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = xfs_vn_follow_link,
.put_link = xfs_vn_put_link,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..339c52b1a434 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
xip->i_new_size = new_size;
if (likely(!(ioflags & IO_INVIS)))
- xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ file_update_time(file);
/*
* If the offset is beyond the size of the file, we have a couple
@@ -811,19 +811,22 @@ write_retry:
XFS_STATS_ADD(xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
int error2;
xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
- error2 = sync_page_range(inode, mapping, pos, ret);
+
+ error2 = filemap_write_and_wait_range(mapping, pos, end);
if (!error)
error = error2;
if (need_i_mutex)
mutex_lock(&inode->i_mutex);
xfs_ilock(xip, iolock);
- error2 = xfs_write_sync_logforce(mp, xip);
+
+ error2 = xfs_fsync(xip);
if (!error)
error = error2;
}
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..3d4a0c84d634 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -80,7 +80,7 @@ xfs_fs_set_xstate(
if (sb->s_flags & MS_RDONLY)
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
}
-struct quotactl_ops xfs_quotactl_operations = {
+const struct quotactl_ops xfs_quotactl_operations = {
.quota_sync = xfs_fs_quota_sync,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
DEFINE_PER_CPU(struct xfsstats, xfsstats);
-STATIC int
-xfs_read_xfsstats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
{
- int c, i, j, len, val;
+ int c, i, j, val;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
};
/* Loop over all stats groups */
- for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
- len += sprintf(buffer + len, "%s", xstats[i].desc);
+ for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ seq_printf(m, "%s", xstats[i].desc);
/* inner loop does each group */
while (j < xstats[i].endpoint) {
val = 0;
/* sum over all cpus */
for_each_possible_cpu(c)
val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- len += sprintf(buffer + len, " %u", val);
+ seq_printf(m, " %u", val);
j++;
}
- buffer[len++] = '\n';
+ seq_putc(m, '\n');
}
/* extra precision counters */
for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
}
- len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+ seq_printf(m, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += sprintf(buffer + len, "debug %u\n",
+ seq_printf(m, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xfs_stat_proc_show, NULL);
}
+static const struct file_operations xfs_stat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xfs_stat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
goto out;
- if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
- xfs_read_xfsstats, NULL))
+ if (!proc_create("fs/xfs/stat", 0, NULL,
+ &xfs_stat_proc_fops))
goto out_remove_entry;
return 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..25b6903a3bce 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
#include <linux/freezer.h>
#include <linux/parser.h>
-static struct super_operations xfs_super_operations;
+static const struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
@@ -579,15 +579,19 @@ xfs_showargs(
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, "," MNTOPT_UQUOTANOENF);
- if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_PRJQUOTA);
- else if (mp->m_qflags & XFS_PQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
- if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_GRPQUOTA);
- else if (mp->m_qflags & XFS_GQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ /* Either project or group quotas can be active, not both */
+
+ if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_PRJQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_GRPQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ }
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
return error;
}
-void
+STATIC void
xfs_mountfs_check_barriers(xfs_mount_t *mp)
{
int error;
@@ -926,13 +930,39 @@ xfs_fs_alloc_inode(
*/
STATIC void
xfs_fs_destroy_inode(
- struct inode *inode)
+ struct inode *inode)
{
- xfs_inode_t *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ xfs_itrace_entry(ip);
XFS_STATS_INC(vn_reclaim);
- if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
+
+ /* bad inode, get out here ASAP */
+ if (is_bad_inode(inode))
+ goto out_reclaim;
+
+ xfs_ioend_wait(ip);
+
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+ /*
+ * We should never get here with one of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+ * If we have nothing to flush with this inode then complete the
+ * teardown now, otherwise delay the flush operation.
+ */
+ if (!xfs_inode_clean(ip)) {
+ xfs_inode_set_reclaim_tag(ip);
+ return;
+ }
+
+out_reclaim:
+ xfs_ireclaim(ip);
}
/*
@@ -969,7 +999,28 @@ xfs_fs_inode_init_once(
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+}
+
+/*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode. Care must be taken
+ * here - the transaction code calls mark_inode_dirty_sync() to mark the
+ * VFS inode dirty in a transaction and clears the i_update_core field;
+ * it must clear the field after calling mark_inode_dirty_sync() to
+ * correctly indicate that the dirty state has been propagated into the
+ * inode log item.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+ struct inode *inode)
+{
+ barrier();
+ XFS_I(inode)->i_update_core = 1;
}
/*
@@ -1049,6 +1100,20 @@ xfs_fs_clear_inode(
XFS_STATS_INC(vn_remove);
XFS_STATS_DEC(vn_active);
+ /*
+ * The iolock is used by the file system to coordinate reads,
+ * writes, and block truncates. Up to this point the lock
+ * protected concurrent accesses by users of the inode. But
+ * from here forward we're doing some final processing of the
+ * inode because we're done with it, and although we reuse the
+ * iolock for protection it is really a distinct lock class
+ * (in the lockdep sense) from before. To keep lockdep happy
+ * (and basically indicate what we are doing), we explicitly
+ * re-init the iolock here.
+ */
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
xfs_inactive(ip);
}
@@ -1122,7 +1187,7 @@ xfs_fs_put_super(
}
STATIC int
-xfs_fs_sync_super(
+xfs_fs_sync_fs(
struct super_block *sb,
int wait)
{
@@ -1130,23 +1195,23 @@ xfs_fs_sync_super(
int error;
/*
- * Treat a sync operation like a freeze. This is to work
- * around a race in sync_inodes() which works in two phases
- * - an asynchronous flush, which can write out an inode
- * without waiting for file size updates to complete, and a
- * synchronous flush, which wont do anything because the
- * async flush removed the inode's dirty flag. Also
- * sync_inodes() will not see any files that just have
- * outstanding transactions to be flushed because we don't
- * dirty the Linux inode until after the transaction I/O
- * completes.
+ * Not much we can do for the first async pass. Writing out the
+ * superblock would be counter-productive as we are going to redirty
+ * when writing out other data and metadata (and writing out a single
+ * block is quite fast anyway).
+ *
+ * Try to asynchronously kick off quota syncing at least.
*/
- if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
- error = xfs_quiesce_data(mp);
- else
- error = xfs_sync_fsdata(mp, 0);
+ if (!wait) {
+ xfs_qm_sync(mp, SYNC_TRYLOCK);
+ return 0;
+ }
+
+ error = xfs_quiesce_data(mp);
+ if (error)
+ return -error;
- if (unlikely(laptop_mode)) {
+ if (laptop_mode) {
int prev_sync_seq = mp->m_sync_seq;
/*
@@ -1165,7 +1230,7 @@ xfs_fs_sync_super(
mp->m_sync_seq != prev_sync_seq);
}
- return -error;
+ return 0;
}
STATIC int
@@ -1532,13 +1597,14 @@ xfs_fs_get_sb(
mnt);
}
-static struct super_operations xfs_super_operations = {
+static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
.write_inode = xfs_fs_write_inode,
.clear_inode = xfs_fs_clear_inode,
.put_super = xfs_fs_put_super,
- .sync_fs = xfs_fs_sync_super,
+ .sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
.statfs = xfs_fs_statfs,
.remount_fs = xfs_fs_remount,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
extern struct xattr_handler *xfs_xattr_handlers[];
-extern struct quotactl_ops xfs_quotactl_operations;
+extern const struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..d895a3a960f5 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
STATIC int
xfs_commit_dummy_trans(
struct xfs_mount *mp,
- uint log_flags)
+ uint flags)
{
struct xfs_inode *ip = mp->m_rootip;
struct xfs_trans *tp;
int error;
+ int log_flags = XFS_LOG_FORCE;
+
+ if (flags & SYNC_WAIT)
+ log_flags |= XFS_LOG_SYNC;
/*
* Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /* XXX(hch): ignoring the error here.. */
error = xfs_trans_commit(tp, 0);
-
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* the log force ensures this transaction is pushed to disk */
xfs_log_force(mp, 0, log_flags);
- return 0;
+ return error;
}
int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
else
XFS_BUF_ASYNC(bp);
- return xfs_bwrite(mp, bp);
+ error = xfs_bwrite(mp, bp);
+ if (error)
+ return error;
+
+ /*
+ * If this is a data integrity sync make sure all pending buffers
+ * are flushed out for the log coverage check below.
+ */
+ if (flags & SYNC_WAIT)
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+ if (xfs_log_need_covered(mp))
+ error = xfs_commit_dummy_trans(mp, flags);
+ return error;
out_brelse:
xfs_buf_relse(bp);
@@ -419,14 +435,16 @@ xfs_quiesce_data(
/* push non-blocking */
xfs_sync_data(mp, 0);
xfs_qm_sync(mp, SYNC_TRYLOCK);
- xfs_filestream_flush(mp);
- /* push and block */
+ /* push and block till complete */
xfs_sync_data(mp, SYNC_WAIT);
xfs_qm_sync(mp, SYNC_WAIT);
+ /* drop inode references pinned by filestreams */
+ xfs_filestream_flush(mp);
+
/* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp, 0);
+ error = xfs_sync_fsdata(mp, SYNC_WAIT);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -570,8 +588,6 @@ xfs_sync_worker(
/* dgc: errors ignored here */
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
- if (xfs_log_need_covered(mp))
- error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
@@ -647,10 +663,9 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-int
+STATIC int
xfs_reclaim_inode(
xfs_inode_t *ip,
- int locked,
int sync_mode)
{
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
@@ -666,10 +681,6 @@ xfs_reclaim_inode(
!__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
- if (locked) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
return -EAGAIN;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
@@ -688,10 +699,8 @@ xfs_reclaim_inode(
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
@@ -749,21 +758,6 @@ __xfs_inode_clear_reclaim_tag(
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
}
-void
-xfs_inode_clear_reclaim_tag(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
-
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-}
-
STATIC int
xfs_reclaim_inode_now(
struct xfs_inode *ip,
@@ -777,7 +771,7 @@ xfs_reclaim_inode_now(
}
read_unlock(&pag->pag_ici_lock);
- return xfs_reclaim_inode(ip, 0, flags);
+ return xfs_reclaim_inode(ip, flags);
}
int
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..a500b4d91835 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,12 +44,10 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
xfs_stats_clear_proc_handler(
ctl_table *ctl,
int write,
- struct file *filp,
void __user *buffer,
size_t *lenp,
loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
- ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
printk("XFS Clearing xfsstats\n");
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
struct xqmstats xqmstats;
-STATIC int
-xfs_qm_read_xfsquota(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_show(struct seq_file *m, void *v)
{
- int len;
-
/* maximum; incore; ratio free to inuse; freelist */
- len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
ndquot,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+ return 0;
}
-STATIC int
-xfs_qm_read_stats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
{
- int len;
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
/* quota performance statistics */
- len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+ seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
xqmstats.xs_qm_dqreclaims,
xqmstats.xs_qm_dqreclaim_misses,
xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
xqmstats.xs_qm_dqwants,
xqmstats.xs_qm_dqshake_reclaims,
xqmstats.xs_qm_dqinact_reclaims);
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
}
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void
xfs_qm_init_procfs(void)
{
- create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
- create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+ proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+ proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
}
void
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 4e4276b956e8..5d1a3b98a6e6 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -876,7 +876,6 @@ xfs_dqrele_inode(
ip->i_gdquot = NULL;
}
xfs_iput(ip, XFS_ILOCK_EXCL);
- IRELE(ip);
return 0;
}
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
xfs_agino_t pagi_count; /* number of allocated inodes */
int pagb_count; /* pagb slots in use */
xfs_perag_busy_t *pagb_list; /* unstable blocks */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
spinlock_t pagb_lock; /* lock for pagb_list */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
* entry (null if none). Else, *lastxp will be set to the index
* of the found entry; *gotp will contain the entry.
*/
-xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_multi_extents(
xfs_ifork_t *ifp, /* inode fork pointer */
xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
int whichfork,
int *count);
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
- xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
ext_flag);
}
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
- get_unaligned_be64(&r->l1), s);
-}
-
/*
* Extract the blockcount field from an on disk bmap extent record.
*/
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
*l1 = 0;
}
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+ xfs_bmbt_rec_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+ get_unaligned_be64(&r->l1), s);
+}
+
STATIC void
xfs_bmbt_trace_record(
struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
}
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for agno/agbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
- return error;
- }
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
- switch (refval) {
- case XFS_ALLOC_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- break;
- case XFS_INO_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
- break;
- }
- }
- *bpp = bp;
- return 0;
-}
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -2951,7 +2911,7 @@ error0:
* inode we have to copy the single block it was pointing to into the
* inode.
*/
-int
+STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
int refval);/* ref count value for buffer */
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- struct xfs_buf **bpp, /* buffer for agno/agbno */
- int refval);/* ref count value for buffer */
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9ee125f..ab89a7e94a0f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
- (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
- (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
- (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = XFS_ERROR(EBUSY);
goto out_unlock;
}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e459442..41ad537c49e9 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
*/
ra_want = howmany(bufsize + mp->m_dirblksize,
mp->m_sb.sb_blocksize) - 1;
+ ASSERT(ra_want >= 0);
/*
* If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
*/
ptr += length;
curoff += length;
- bufsize -= length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
}
/*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
#define BMV_IF_VALID \
(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
-/* bmv_oflags values - returned for for each non-header segment */
+/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e6..6f83f58c099f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
xfs_inode_t *ip;
int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..0785797db828 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
}
/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-STATIC int /* error */
-xfs_inobt_lookup_eq(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
int /* error */
-xfs_inobt_lookup_ge(
+xfs_inobt_lookup(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
+ xfs_lookup_t dir, /* <=, >=, == */
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
}
/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_le(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
STATIC int /* error */
xfs_inobt_update(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free) /* free inode mask */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
{
union xfs_btree_rec rec;
- rec.inobt.ir_startino = cpu_to_be32(ino);
- rec.inobt.ir_freecount = cpu_to_be32(fcnt);
- rec.inobt.ir_free = cpu_to_be64(free);
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -135,9 +95,7 @@ xfs_inobt_update(
int /* error */
xfs_inobt_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t *ino, /* output: starting inode of chunk */
- __int32_t *fcnt, /* output: number of free inodes */
- xfs_inofree_t *free, /* output: free inode mask */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
int *stat) /* output: success/failure */
{
union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
error = xfs_btree_get_rec(cur, &rec, stat);
if (!error && *stat == 1) {
- *ino = be32_to_cpu(rec->inobt.ir_startino);
- *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
- *free = be64_to_cpu(rec->inobt.ir_free);
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
return error;
}
/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int blks_per_cluster, nbufs, ninodes;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+
+ /*
+ * Loop over the new block(s), filling in the inodes.
+ * For small block sizes, manipulate the inodes in buffers
+ * which are multiples of the blocks size.
+ */
+ if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+ blks_per_cluster = 1;
+ nbufs = length;
+ ninodes = mp->m_sb.sb_inopblock;
+ } else {
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+ mp->m_sb.sb_blocksize;
+ nbufs = length / blks_per_cluster;
+ ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+ }
+
+ /*
+ * Figure out what version number to use in the inodes we create.
+ * If the superblock version has caught up to the one that supports
+ * the new inode format, then use the new inode version. Otherwise
+ * use the old version so that old kernels will continue to be
+ * able to use the file system.
+ */
+ if (xfs_sb_version_hasnlink(&mp->m_sb))
+ version = 2;
+ else
+ version = 1;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XFS_BUF_LOCK);
+ ASSERT(fbuf);
+ ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+ /*
+ * Initialize all inodes in this buffer and then log them.
+ *
+ * XXX: It would be much better if we had just one transaction
+ * to log a whole cluster of inodes instead of all the
+ * individual transactions causing a lot of log traffic.
+ */
+ xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+ for (i = 0; i < ninodes; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = sizeof(struct xfs_dinode);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+ }
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ }
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- int blks_per_cluster; /* fs blocks per inode cluster */
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_daddr_t d; /* disk addr of buffer */
xfs_agnumber_t agno;
int error;
- xfs_buf_t *fbuf; /* new free inodes' buffer */
- xfs_dinode_t *free; /* new free inode structure */
- int i; /* inode counter */
- int j; /* block counter */
- int nbufs; /* num bufs of new inodes */
+ int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- int ninodes; /* num inodes per buf */
xfs_agino_t thisino; /* current inode number, for loop */
- int version; /* inode version number to use */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
- unsigned int gen;
args.tp = tp;
args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
*/
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
XFS_IALLOC_BLOCKS(args.mp);
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.mod = args.total = args.wasdel = args.isfl =
args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
return 0;
}
ASSERT(args.len == args.minlen);
- /*
- * Convert the results.
- */
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
- /*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
- */
- if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
- blks_per_cluster = 1;
- nbufs = (int)args.len;
- ninodes = args.mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
- args.mp->m_sb.sb_blocksize;
- nbufs = (int)args.len / blks_per_cluster;
- ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
- }
- /*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
- */
- if (xfs_sb_version_hasnlink(&args.mp->m_sb))
- version = 2;
- else
- version = 1;
/*
+ * Stamp and write the inode buffers.
+ *
* Seed the new inode cluster with a random generation number. This
* prevents short-term reuse of generation numbers if a chunk is
* freed and then immediately reallocated. We use random numbers
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- gen = random32();
- for (j = 0; j < nbufs; j++) {
- /*
- * Get the block.
- */
- d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno + (j * blks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
- args.mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
- ASSERT(fbuf);
- ASSERT(!XFS_BUF_GETERROR(fbuf));
+ xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+ random32());
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction to
- * log a whole cluster of inodes instead of all the individual
- * transactions causing a lot of log traffic.
- */
- xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
- for (i = 0; i < ninodes; i++) {
- int ioffset = i << args.mp->m_sb.sb_inodelog;
- uint isize = sizeof(struct xfs_dinode);
-
- free = xfs_make_iptr(args.mp, fbuf, i);
- free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- free->di_version = version;
- free->di_gen = cpu_to_be32(gen);
- free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
- }
- xfs_trans_inode_alloc_buf(tp, fbuf);
- }
+ /*
+ * Convert the results.
+ */
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- agno = be32_to_cpu(agi->agi_seqno);
down_read(&args.mp->m_peraglock);
args.mp->m_perag[agno].pagi_freecount += newlen;
up_read(&args.mp->m_peraglock);
agi->agi_newino = cpu_to_be32(newino);
+
/*
* Insert records describing the new inode chunk into the btree.
*/
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
for (thisino = newino;
thisino < newino + newlen;
thisino += XFS_INODES_PER_CHUNK) {
- if ((error = xfs_inobt_lookup_eq(cur, thisino,
- XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+ cur->bc_rec.i.ir_startino = thisino;
+ cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+ cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
ASSERT(i == 0);
- if ((error = xfs_btree_insert(cur, &i))) {
+ error = xfs_btree_insert(cur, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
@@ -539,6 +557,62 @@ nextag:
}
/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+/*
* Visible inode allocation functions.
*/
@@ -592,8 +666,8 @@ xfs_dialloc(
int j; /* result code */
xfs_mount_t *mp; /* file system mount structure */
int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's a.g. relative inode # */
- xfs_agnumber_t pagno; /* parent's allocation group number */
+ xfs_agino_t pagino; /* parent's AG relative inode # */
+ xfs_agnumber_t pagno; /* parent's AG number */
xfs_inobt_rec_incore_t rec; /* inode allocation record */
xfs_agnumber_t tagno; /* testing allocation group number */
xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
*/
agno = tagno;
*IO_agbp = NULL;
+
+ restart_pagno:
cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,200 @@ nextag:
*/
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
- * If in the same a.g. as the parent, try to get near the parent.
+ * If in the same AG as the parent, try to get near the parent.
*/
if (pagno == agno) {
- if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+ xfs_perag_t *pag = &mp->m_perag[agno];
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
goto error0;
- if (i != 0 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (rec.ir_freecount > 0) {
/*
* Found a free inode in the same chunk
- * as parent, done.
+ * as the parent, done.
*/
+ goto alloc_inode;
}
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
/*
- * In the same a.g. as parent, but parent's chunk is full.
+ * Skip to last blocks looked up if same parent inode.
*/
- else {
- int doneleft; /* done, to the left */
- int doneright; /* done, to the right */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright, 0);
if (error)
- goto error0;
- ASSERT(i == 1);
- ASSERT(j == 1);
- /*
- * Duplicate the cursor, search left & right
- * simultaneously.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- goto error0;
- /*
- * Search left with tcur, back up 1 record.
- */
- if ((error = xfs_btree_decrement(tcur, 0, &i)))
goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Search right with cur, go forward 1 record.
- */
- if ((error = xfs_btree_increment(cur, 0, &i)))
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Loop until we find the closest inode chunk
- * with a free one.
- */
- while (!doneleft || !doneright) {
- int useleft; /* using left inode
- chunk this time */
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
/*
- * Figure out which block is closer,
- * if both are valid.
- */
- if (!doneleft && !doneright)
- useleft =
- pagino -
- (trec.ir_startino +
- XFS_INODES_PER_CHUNK - 1) <
- rec.ir_startino - pagino;
- else
- useleft = !doneleft;
- /*
- * If checking the left, does it have
- * free inodes?
- */
- if (useleft && trec.ir_freecount) {
- /*
- * Yes, set it up as the chunk to use.
- */
- rec = trec;
- xfs_btree_del_cursor(cur,
- XFS_BTREE_NOERROR);
- cur = tcur;
- break;
- }
- /*
- * If checking the right, does it have
- * free inodes?
- */
- if (!useleft && rec.ir_freecount) {
- /*
- * Yes, it's already set up.
- */
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- break;
- }
- /*
- * If used the left, get another one
- * further left.
- */
- if (useleft) {
- if ((error = xfs_btree_decrement(tcur, 0,
- &i)))
- goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(
- tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
- /*
- * If used the right, get another one
- * further right.
+ * Not in range - save last search
+ * location and allocate a new inode
*/
- else {
- if ((error = xfs_btree_increment(cur, 0,
- &i)))
- goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(
- cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
}
- ASSERT(!doneleft || !doneright);
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
}
+
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
}
+
/*
- * In a different a.g. from the parent.
+ * In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
- else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
- if ((error = xfs_inobt_lookup_eq(cur,
- be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+newino:
+ if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
goto error0;
- if (i == 1 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
- /*
- * The last chunk allocated in the group still has
- * a free inode.
- */
- }
- /*
- * None left in the last group, search the whole a.g.
- */
- else {
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- ASSERT(i == 1);
- for (;;) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (rec.ir_freecount > 0)
- break;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
}
}
}
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
offset = xfs_ialloc_find_free(&rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1001,19 @@ nextag:
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
- rec.ir_free)))
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
goto error0;
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
mp->m_perag[tagno].pagi_freecount--;
up_read(&mp->m_peraglock);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
*inop = ino;
@@ -1062,38 +1104,23 @@ xfs_difree(
* Initialize the cursor.
*/
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
/*
* Look for the entry describing this inode.
*/
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
error, mp->m_fsname);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
- &rec.ir_free, &i))) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
cmn_err(CE_WARN,
"xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
error, mp->m_fsname);
@@ -1148,12 +1175,14 @@ xfs_difree(
} else {
*delete = 0;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_update returned an error %d on %s.",
error, mp->m_fsname);
goto error0;
}
+
/*
* Change the inode free counts and log the ag/sb changes.
*/
@@ -1165,28 +1194,10 @@ xfs_difree(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1297,9 +1308,7 @@ xfs_imap(
chunk_agbno = agbno - offset_agbno;
} else {
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
+ xfs_inobt_rec_incore_t chunk_rec;
xfs_buf_t *agbp; /* agi buffer */
int i; /* temp state */
@@ -1315,15 +1324,14 @@ xfs_imap(
}
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_lookup_le() failed");
+ "xfs_inobt_lookup() failed");
goto error0;
}
- error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i);
+ error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_get_rec() failed");
@@ -1341,7 +1349,7 @@ xfs_imap(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+ chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
offset_agbno = agbno - chunk_agbno;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
xfs_agnumber_t agno); /* allocation group number */
/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
-int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
/*
* Get the data from the pointed-to record.
*/
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
- __int32_t *fcnt, xfs_inofree_t *free, int *stat);
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..cc72c561ff52 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -73,6 +73,9 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
/* initialise the xfs inode */
ip->i_ino = ino;
@@ -82,7 +85,6 @@ xfs_inode_alloc(
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_update_core = 0;
- ip->i_update_size = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
ip->i_size = 0;
@@ -456,32 +458,6 @@ out_error_or_again:
return error;
}
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t *mp,
- xfs_ino_t ino,
- xfs_trans_t *tp)
-{
- xfs_inode_t *ip;
- xfs_perag_t *pag;
-
- pag = xfs_get_perag(mp, ino);
- read_lock(&pag->pag_ici_lock);
- ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-
- /* the returned inode must match the transaction */
- if (ip && (ip->i_transp != tp))
- return NULL;
- return ip;
-}
-
/*
* Decrement reference count of an inode structure and unlock it.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..b92a4fa2a0a1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
return 0;
}
-void
+STATIC void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
xfs_dinode_t *from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
* In that case the pages will still be in memory, but the inode size
* will never have been updated.
*/
-xfs_fsize_t
+STATIC xfs_fsize_t
xfs_file_last_byte(
xfs_inode_t *ip)
{
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
SYNCHRONIZE();
/*
- * Make sure to get the latest atime from the Linux inode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- xfs_synchronize_atime(ip);
+ xfs_synchronize_times(ip);
if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
/*
* Resize an extent indirection array to new_size bytes.
*/
-void
+STATIC void
xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
/*
* Switch from indirection array to linear (direct) extent allocations.
*/
-void
+STATIC void
xfs_iext_indirect_to_direct(
xfs_ifork_t *ifp) /* inode fork pointer */
{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..41555de1d1db 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
/* Miscellaneous state. */
unsigned short i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
- unsigned char i_update_size; /* di_size field is dirty */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/*
* xfs_iget.c prototypes.
*/
-xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
- struct xfs_trans *);
int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
uint, uint, xfs_inode_t **, xfs_daddr_t);
void xfs_iput(xfs_inode_t *, uint);
@@ -504,11 +501,10 @@ void xfs_ipin(xfs_inode_t *);
void xfs_iunpin(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
void xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
-void xfs_synchronize_atime(xfs_inode_t *);
+void xfs_synchronize_times(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
#if defined(XFS_INODE_TRACE)
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
struct xfs_buf **, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, xfs_daddr_t, uint);
-void xfs_dinode_from_disk(struct xfs_icdinode *,
- struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void xfs_iext_indirect_to_direct(xfs_ifork_t *);
void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..9794b876d6ff 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
nvecs = 1;
/*
+ * Make sure the linux inode is dirty. We do this before
+ * clearing i_update_core as the VFS will call back into
+ * XFS here and set i_update_core, so we need to dirty the
+ * inode first so that the ordering of i_update_core and
+ * unlogged modifications still works as described below.
+ */
+ xfs_mark_inode_dirty_sync(ip);
+
+ /*
* Clear i_update_core if the timestamps (or any other
* non-transactional modification) need flushing/logging
* and we're about to log them with the rest of the core.
@@ -263,22 +272,9 @@ xfs_inode_item_format(
}
/*
- * We don't have to worry about re-ordering here because
- * the update_size field is protected by the inode lock
- * and we have that held in exclusive mode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- if (ip->i_update_size)
- ip->i_update_size = 0;
-
- /*
- * Make sure to get the latest atime from the Linux inode.
- */
- xfs_synchronize_atime(ip);
-
- /*
- * make sure the linux inode is dirty
- */
- xfs_mark_inode_dirty_sync(ip);
+ xfs_synchronize_times(ip);
vecp->i_addr = (xfs_caddr_t)&ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
@@ -712,8 +708,6 @@ xfs_inode_item_unlock(
* Clear out the fields of the inode log item particular
* to the current transaction.
*/
- iip->ili_ilock_recur = 0;
- iip->ili_iolock_recur = 0;
iip->ili_flags = 0;
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_ilock_recur; /* lock recursion count */
- unsigned short ili_iolock_recur; /* lock recursion count */
unsigned short ili_flags; /* misc flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
#if XFS_BIG_INUMS
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
#else
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
#endif
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30a..7294abce6ef2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
* set up a transaction to convert the range of extents
* from unwritten to real. Do allocations in a loop until
* we have covered the range passed in.
+ *
+ * Note that we open code the transaction allocation here
+ * to pass KM_NOFS--we can't risk to recursing back into
+ * the filesystem here as we might be asked to write out
+ * the same inode that we complete here and might deadlock
+ * on the iolock.
*/
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), 0,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..62efab2f3839 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
#include "xfs_error.h"
#include "xfs_btree.h"
-int
+STATIC int
xfs_internal_inum(
xfs_mount_t *mp,
xfs_ino_t ino)
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
{
xfs_icdinode_t *dic; /* dinode core info pointer */
xfs_inode_t *ip; /* incore inode pointer */
+ struct inode *inode;
int error;
error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
+ inode = VFS_I(ip);
/* xfs_iget returns the following without needing
* further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
+
/*
- * We are reading the atime from the Linux inode because the
- * dinode might not be uptodate.
+ * We need to read the timestamps from the Linux inode because
+ * the VFS keeps writing directly into the inode structure instead
+ * of telling us about the updates.
*/
- buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
- buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
- buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
- buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
@@ -353,9 +358,6 @@ xfs_bulkstat(
int end_of_ag; /* set if we've seen the ag end */
int error; /* error code */
int fmterror;/* bulkstat formatter result */
- __int32_t gcnt; /* current btree rec's count */
- xfs_inofree_t gfree; /* current btree rec's free mask */
- xfs_agino_t gino; /* current btree rec's start inode */
int i; /* loop index */
int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +444,43 @@ xfs_bulkstat(
* we need to get the remainder of the chunk we're in.
*/
if (agino > 0) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Lookup the inode chunk that this inode lives in.
*/
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+ &tmp);
if (!error && /* no I/O error */
tmp && /* lookup succeeded */
/* got the record, should always work */
- !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) &&
+ !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
i == 1 &&
/* this is the right chunk */
- agino < gino + XFS_INODES_PER_CHUNK &&
+ agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
/* lastino was not last in chunk */
- (chunkidx = agino - gino + 1) <
+ (chunkidx = agino - r.ir_startino + 1) <
XFS_INODES_PER_CHUNK &&
/* there are some left allocated */
xfs_inobt_maskn(chunkidx,
- XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+ XFS_INODES_PER_CHUNK - chunkidx) &
+ ~r.ir_free) {
/*
* Grab the chunk record. Mark all the
* uninteresting inodes (because they're
* before our start point) free.
*/
for (i = 0; i < chunkidx; i++) {
- if (XFS_INOBT_MASK(i) & ~gfree)
- gcnt++;
+ if (XFS_INOBT_MASK(i) & ~r.ir_free)
+ r.ir_freecount++;
}
- gfree |= xfs_inobt_maskn(0, chunkidx);
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- agino = gino + XFS_INODES_PER_CHUNK;
- icount = XFS_INODES_PER_CHUNK - gcnt;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+ icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
} else {
/*
* If any of those tests failed, bump the
@@ -493,7 +498,7 @@ xfs_bulkstat(
/*
* Start of ag. Lookup the first inode chunk.
*/
- error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
icount = 0;
}
/*
@@ -501,6 +506,8 @@ xfs_bulkstat(
* until we run out of inodes or space in the buffer.
*/
while (irbp < irbufend && icount < ubcount) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Loop as long as we're unable to read the
* inode btree.
@@ -510,51 +517,55 @@ xfs_bulkstat(
if (XFS_AGINO_TO_AGBNO(mp, agino) >=
be32_to_cpu(agi->agi_length))
break;
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
- &tmp);
+ error = xfs_inobt_lookup(cur, agino,
+ XFS_LOOKUP_GE, &tmp);
cond_resched();
}
/*
* If ran off the end of the ag either with an error,
* or the normal way, set end and stop collecting.
*/
- if (error ||
- (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) ||
- i == 0) {
+ if (error) {
+ end_of_ag = 1;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
end_of_ag = 1;
break;
}
+
/*
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (gcnt < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
- for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
- chunkidx = 0;
+ agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+ for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
chunkidx += nicluster,
agbno += nbcluster) {
- if (xfs_inobt_maskn(chunkidx,
- nicluster) & ~gfree)
+ if (xfs_inobt_maskn(chunkidx, nicluster)
+ & ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
agbno, nbcluster);
}
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - gcnt;
+ icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
}
/*
* Set agino to after this chunk and bump the cursor.
*/
- agino = gino + XFS_INODES_PER_CHUNK;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
error = xfs_btree_increment(cur, 0, &tmp);
cond_resched();
}
@@ -820,9 +831,7 @@ xfs_inumbers(
int bufidx;
xfs_btree_cur_t *cur;
int error;
- __int32_t gcnt;
- xfs_inofree_t gfree;
- xfs_agino_t gino;
+ xfs_inobt_rec_incore_t r;
int i;
xfs_ino_t ino;
int left;
@@ -855,7 +864,8 @@ xfs_inumbers(
continue;
}
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+ &tmp);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
cur = NULL;
@@ -870,9 +880,8 @@ xfs_inumbers(
continue;
}
}
- if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
- &i)) ||
- i == 0) {
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
xfs_buf_relse(agbp);
agbp = NULL;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +890,12 @@ xfs_inumbers(
agino = 0;
continue;
}
- agino = gino + XFS_INODES_PER_CHUNK - 1;
- buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
- buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
- buffer[bufidx].xi_allocmask = ~gfree;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+ buffer[bufidx].xi_startino =
+ XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+ buffer[bufidx].xi_alloccount =
+ XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_allocmask = ~r.ir_free;
bufidx++;
left--;
if (bufidx == bcount) {
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
void *dibuff,
int *stat);
-int
-xfs_internal_inum(
- xfs_mount_t *mp,
- xfs_ino_t ino);
-
typedef int (*inumbers_fmt_pf)(
void __user *ubuffer, /* buffer to write to */
const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void xlog_recover_process_iunlinks(xlog_t *log);
-
extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
extern void xlog_put_bp(struct xfs_buf *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..b5b0d8055910 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer(
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+ if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
cmn_err(CE_ALERT,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[i].i_len, __func__);
@@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans(
"XFS: NULL dquot in %s.", __func__);
return XFS_ERROR(EIO);
}
- if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+ if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
cmn_err(CE_ALERT,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
* freeing of the inode and its removal from the list must be
* atomic.
*/
-void
+STATIC void
xlog_recover_process_iunlinks(
xlog_t *log)
{
@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t bufaddr, offset;
+ xfs_caddr_t offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass(
/*
* Check for header wrapping around physical end-of-log
*/
- offset = NULL;
+ offset = XFS_BUF_PTR(hbp);
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- bufaddr = XFS_BUF_PTR(hbp);
error = XFS_BUF_SET_PTR(hbp,
- bufaddr + BBTOB(split_hblks),
+ offset + BBTOB(split_hblks),
BBTOB(hblks - split_hblks));
if (error)
goto bread_err2;
@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(hbp, bufaddr,
+ error = XFS_BUF_SET_PTR(hbp, offset,
BBTOB(hblks));
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, 0,
- wrapped_hblks, hbp);
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass(
} else {
/* This log record is split across the
* physical end of log */
- offset = NULL;
+ offset = XFS_BUF_PTR(dbp);
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- bufaddr = XFS_BUF_PTR(dbp);
error = XFS_BUF_SET_PTR(dbp,
- bufaddr + BBTOB(split_bblks),
+ offset + BBTOB(split_bblks),
BBTOB(bblks - split_bblks));
if (error)
goto bread_err2;
@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+ error = XFS_BUF_SET_PTR(dbp, offset, h_size);
if (error)
goto bread_err2;
-
- if (!offset)
- offset = xlog_align(log, wrapped_hblks,
- bblks - split_bblks, dbp);
}
xlog_unpack_data(rhead, offset, log);
if ((error = xlog_recover_process_data(log, rhash,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..4d509f742bd2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return 0;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
XFS_DEFAULT_LOG_COUNT);
if (error) {
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
*
* The m_sb_lock must be held when this routine is called.
*/
-int
+STATIC int
xfs_mod_incore_sb_unlocked(
xfs_mount_t *mp,
xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
extern int xfs_log_sbcount(xfs_mount_t *, uint);
extern int xfs_mountfs(xfs_mount_t *mp);
-extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
- int64_t, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
}
/*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
- xfs_mru_cache_t *mru,
- unsigned long key)
-{
- xfs_mru_cache_elem_t *elem;
-
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- return NULL;
-
- spin_lock(&mru->lock);
- elem = radix_tree_lookup(&mru->store, key);
- if (!elem)
- spin_unlock(&mru->lock);
- else
- __release(mru_lock); /* help sparse not be stupid */
-
- return elem ? elem->value : NULL;
-}
-
-/*
* To release the internal data structure spinlock after having performed an
* xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
* with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
}
/*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
- xfs_mount_t *mp,
- xfs_inode_t *ip)
-{
- int error = 0;
-
- /*
- * If we're treating this as O_DSYNC and we have not updated the
- * size, force the log.
- */
- if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
- !(ip->i_update_size)) {
- xfs_inode_log_item_t *iip = ip->i_itemp;
-
- /*
- * If an allocation transaction occurred
- * without extending the size, then we have to force
- * the log up the proper point to ensure that the
- * allocation is permanent. We can't count on
- * the fact that buffered writes lock out direct I/O
- * writes - the direct I/O write could have extended
- * the size nontransactionally, then finished before
- * we started. xfs_write_file will think that the file
- * didn't grow but the update isn't safe unless the
- * size change is logged.
- *
- * Force the log if we've committed a transaction
- * against the inode or if someone else has and
- * the commit record hasn't gone to disk (e.g.
- * the inode is pinned). This guarantees that
- * all changes affecting the inode are permanent
- * when we return.
- */
- if (iip && iip->ili_last_lsn) {
- error = _xfs_log_force(mp, iip->ili_last_lsn,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- } else if (xfs_ipincount(ip) > 0) {
- error = _xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- }
-
- } else {
- xfs_trans_t *tp;
-
- /*
- * O_SYNC or O_DSYNC _with_ a size update are handled
- * the same way.
- *
- * If the write was synchronous then we need to make
- * sure that the inode modification time is permanent.
- * We'll have updated the timestamp above, so here
- * we use a synchronous transaction to log the inode.
- * It's not fast, but it's necessary.
- *
- * If this a dsync write and the size got changed
- * non-transactionally, then we need to ensure that
- * the size change gets logged in a synchronous
- * transaction.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_SWRITE_LOG_RES(mp),
- 0, 0, 0))) {
- /* Transaction reserve failed */
- xfs_trans_cancel(tp, 0);
- } else {
- /* Transaction reserve successful */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- }
-
- return error;
-}
-
-/*
* Force a shutdown of the filesystem instantly while keeping
* the filesystem consistent. We don't do an unmount here; just shutdown
* the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..726014d1c925 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK (1<<0)
-#define XFS_FREE_EOF_NOLOCK (1<<1)
-
-
-/*
* helper function to extract extent size hint from inode
*/
STATIC_INLINE xfs_extlen_t
@@ -68,7 +61,6 @@ xfs_get_extsz_hint(
* Prototypes for functions in xfs_rw.c.
*/
extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
extern int xfs_bioerror(struct xfs_buf *bp);
extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +70,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
xfs_buf_t *bp, xfs_daddr_t blkno);
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
- int flags);
-
#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e62..237badcbac3b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
uint type)
{
xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
- return _xfs_trans_alloc(mp, type);
+ return _xfs_trans_alloc(mp, type, KM_SLEEP);
}
xfs_trans_t *
_xfs_trans_alloc(
xfs_mount_t *mp,
- uint type)
+ uint type,
+ uint memflags)
{
xfs_trans_t *tp;
atomic_inc(&mp->m_active_trans);
- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
tp->t_magic = XFS_TRANS_MAGIC;
tp->t_type = type;
tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..a0574f593f52 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_GROWFS 14
#define XFS_TRANS_STRAT_WRITE 15
#define XFS_TRANS_DIOSTRAT 16
-#define XFS_TRANS_WRITE_SYNC 17
+/* 17 was XFS_TRANS_WRITE_SYNC */
#define XFS_TRANS_WRITEID 18
#define XFS_TRANS_ADDAFORK 19
#define XFS_TRANS_ATTRINVAL 20
@@ -924,7 +924,7 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces.
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f31271c30de9..2ffc570679be 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -467,6 +467,7 @@ xfs_trans_ail_update(
{
xfs_log_item_t *dlip = NULL;
xfs_log_item_t *mlip; /* ptr to minimum lip */
+ xfs_lsn_t tail_lsn;
mlip = xfs_ail_min(ailp);
@@ -483,8 +484,16 @@ xfs_trans_ail_update(
if (mlip == dlip) {
mlip = xfs_ail_min(ailp);
+ /*
+ * It is not safe to access mlip after the AIL lock is
+ * dropped, so we must get a copy of li_lsn before we do
+ * so. This is especially important on 32-bit platforms
+ * where accessing and updating 64-bit values like li_lsn
+ * is not atomic.
+ */
+ tail_lsn = mlip->li_lsn;
spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
} else {
spin_unlock(&ailp->xa_lock);
}
@@ -514,6 +523,7 @@ xfs_trans_ail_delete(
{
xfs_log_item_t *dlip;
xfs_log_item_t *mlip;
+ xfs_lsn_t tail_lsn;
if (lip->li_flags & XFS_LI_IN_AIL) {
mlip = xfs_ail_min(ailp);
@@ -527,9 +537,16 @@ xfs_trans_ail_delete(
if (mlip == dlip) {
mlip = xfs_ail_min(ailp);
+ /*
+ * It is not safe to access mlip after the AIL lock
+ * is dropped, so we must get a copy of li_lsn
+ * before we do so. This is especially important
+ * on 32-bit platforms where accessing and updating
+ * 64-bit values like li_lsn is not atomic.
+ */
+ tail_lsn = mlip ? mlip->li_lsn : 0;
spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount,
- (mlip ? mlip->li_lsn : 0));
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
} else {
spin_unlock(&ailp->xa_lock);
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
return (flags & XFS_BUF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
- if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+ if (XFS_BUF_GETERROR(bp) != 0) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
bp, blkno);
error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
return error;
}
#ifdef DEBUG
- if (xfs_do_error && (bp != NULL)) {
+ if (xfs_do_error) {
if (xfs_error_target == target) {
if (((xfs_req_num++) % xfs_error_mod) == 0) {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
/*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction. If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively. This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength. That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa. Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
*/
int
xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
xfs_inode_t **ipp)
{
int error;
- xfs_inode_t *ip;
-
- /*
- * If the transaction pointer is NULL, just call the normal
- * xfs_iget().
- */
- if (tp == NULL)
- return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
- /*
- * If we find the inode in core with this transaction
- * pointer in its i_transp field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the inode to the caller.
- * Assert that the inode is already locked in the mode requested
- * by the caller. We cannot do lock promotions yet, so
- * die if someone gets this wrong.
- */
- if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
- /*
- * Make sure that the inode lock is held EXCL and
- * that the io lock is never upgraded when the inode
- * is already a part of the transaction.
- */
- ASSERT(ip->i_itemp != NULL);
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
- if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
- ip->i_itemp->ili_iolock_recur++;
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- ip->i_itemp->ili_ilock_recur++;
- }
- *ipp = ip;
- return 0;
- }
-
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
- if (error) {
- return error;
- }
- ASSERT(ip != NULL);
- xfs_trans_ijoin(tp, ip, lock_flags);
- *ipp = ip;
- return 0;
+ error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+ if (!error && tp)
+ xfs_trans_ijoin(tp, *ipp, lock_flags);
+ return error;
}
/*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
ASSERT(iip->ili_flags == 0);
- ASSERT(iip->ili_ilock_recur == 0);
- ASSERT(iip->ili_iolock_recur == 0);
/*
* Get a log_item_desc to point at the new item.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..d98401470cf0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
xfs_inode_t *ip)
{
xfs_trans_t *tp;
- int error;
+ int error = 0;
int log_flushed = 0, changed = 1;
xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
- /* capture size updates in I/O completion before writing the inode. */
- error = xfs_wait_on_pages(ip, 0, -1);
- if (error)
- return XFS_ERROR(error);
-
/*
* We always need to make sure that the required inode state is safe on
- * disk. The vnode might be clean but we still might need to force the
+ * disk. The inode might be clean but we still might need to force the
* log because of committed transactions that haven't hit the disk yet.
* Likewise, there could be unflushed non-transactional changes to the
* inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!(ip->i_update_size || ip->i_update_core)) {
+ if (!ip->i_update_core) {
/*
* Timestamps/size haven't changed since last inode flush or
* inode transaction commit. That means either nothing got
@@ -714,11 +709,16 @@ xfs_fsync(
}
/*
+ * Flags for xfs_free_eofblocks
+ */
+#define XFS_FREE_EOF_TRYLOCK (1<<0)
+
+/*
* This is called by xfs_inactive to free any blocks beyond eof
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
*/
-int
+STATIC int
xfs_free_eofblocks(
xfs_mount_t *mp,
xfs_inode_t *ip,
@@ -731,7 +731,6 @@ xfs_free_eofblocks(
xfs_filblks_t map_len;
int nimaps;
xfs_bmbt_irec_t imap;
- int use_iolock = (flags & XFS_FREE_EOF_LOCK);
/*
* Figure out if there are any blocks beyond the end
@@ -773,14 +772,19 @@ xfs_free_eofblocks(
* cache and we can't
* do that within a transaction.
*/
- if (use_iolock)
+ if (flags & XFS_FREE_EOF_TRYLOCK) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+ } else {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ }
error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
ip->i_size);
if (error) {
xfs_trans_cancel(tp, 0);
- if (use_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
}
@@ -817,8 +821,7 @@ xfs_free_eofblocks(
error = xfs_trans_commit(tp,
XFS_TRANS_RELEASE_LOG_RES);
}
- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
- : XFS_ILOCK_EXCL));
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
}
return error;
}
@@ -1118,7 +1121,17 @@ xfs_release(
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+
+ /*
+ * If we can't get the iolock just skip truncating
+ * the blocks past EOF because we could deadlock
+ * with the mmap_sem otherwise. We'll get another
+ * chance to drop them once the last reference to
+ * the inode is dropped, so we'll never leak blocks
+ * permanently.
+ */
+ error = xfs_free_eofblocks(mp, ip,
+ XFS_FREE_EOF_TRYLOCK);
if (error)
return error;
}
@@ -1189,7 +1202,7 @@ xfs_inactive(
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
(ip->i_delayed_blks != 0)))) {
- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
+ error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
}
@@ -1476,8 +1489,8 @@ xfs_create(
if (error == ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(dp);
- error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
}
if (error == ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
@@ -2461,52 +2474,6 @@ xfs_set_dmattrs(
return error;
}
-int
-xfs_reclaim(
- xfs_inode_t *ip)
-{
-
- xfs_itrace_entry(ip);
-
- ASSERT(!VN_MAPPED(VFS_I(ip)));
-
- /* bad inode, get out here ASAP */
- if (is_bad_inode(VFS_I(ip))) {
- xfs_ireclaim(ip);
- return 0;
- }
-
- xfs_ioend_wait(ip);
-
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
- /*
- * Make sure the atime in the XFS inode is correct before freeing the
- * Linux inode.
- */
- xfs_synchronize_atime(ip);
-
- /*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise break the link between the xfs inode and the
- * linux inode and clean up the xfs inode later. This avoids flushing
- * the inode to disk during the delete operation itself.
- *
- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
- * first to ensure that xfs_iunpin() will never see an xfs inode
- * that has a linux inode being reclaimed. Synchronisation is provided
- * by the i_flags_lock.
- */
- if (!ip->i_update_core && (ip->i_itemp == NULL)) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
- }
- xfs_inode_set_reclaim_tag(ip);
- return 0;
-}
-
/*
* xfs_alloc_file_space()
* This routine allocates disk space for the given file.
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102de71a1..167a467403a5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, mode_t mode, struct xfs_inode **ipp,
cred_t *credp);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,